def __init__(self, **mpe_args):
    """Create a new Multi-Agent Particle env compatible with RLlib.

    Arguments:
        mpe_args (dict): Arguments to pass to the underlying
            make_env.make_env instance.

    Examples:
        from rllib_env import RLlibMultiAgentParticleEnv
        env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference")
        print(env.reset())
    """
    self._env = make_env(**mpe_args)
    self.num_agents = self._env.n
    self.agent_ids = list(range(self.num_agents))
    self.observation_space_dict = self._make_dict(self._env.observation_space)
    self.action_space_dict = self._make_dict(self._env.action_space)
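# The wrapper above relies on a `_make_dict` helper that is not shown in this
# section. A minimal sketch, assuming it simply keys a per-agent list by the
# integer agent ids built in __init__ (hypothetical implementation, not the
# repo's verified one):
def _make_dict(self, values):
    """Map each agent id to the corresponding entry of a per-agent list."""
    return dict(zip(self.agent_ids, values))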
def train(arglist):
    """Run the MADDPG algorithm using the passed-in command-line arguments.

    Args:
        arglist (argparse.Namespace): Parsed command-line arguments object.
    """
    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():

        ###########################################
        #          Create environment             #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #         Create agent trainers           #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #               Initialize                #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')
            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name
            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]
        # Individual agent reward. This was changed so that a reward can be
        # tracked for fixed-policy agents as well as learning agents.
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []
        # Agent rewards for training curve
        final_ep_ag_rewards = []
        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                  Start                  #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step,
                        new_obs_n, rew_n, done_n, info_n)

            # Update information
            episode_step += 1
            # Check if all agents are done
            # done = all(done_n)
            # Check if any agents are done
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format(
                        [rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, mean episode reward: {}, time: {}".format(
                        len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' +
                                    str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(
                    len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old
                # networks unintentionally?
                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards) + prev_ep_ct,
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(reward[-arglist.save_rate:])
                               for reward in agent_rewards],
                              round(time.time() - t_start, 3)))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State",
                                arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
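# The training loop above calls a `log_loss` helper that is defined elsewhere
# in the repo. A minimal sketch of what it is assumed to do, matching the call
# signatures used above: create (or append to) a per-agent loss file keyed by
# the target episode count. The file name and layout here are assumptions, not
# the repo's verified implementation.
def log_loss(arglist, ep_ct, agent_name, loss=None, initialize=False):
    """Append a loss value for one agent, or start a fresh file when initializing."""
    fname = os.path.join(arglist.plots_dir,
                         "{}_{}_{}_loss.csv".format(arglist.exp_name, agent_name, ep_ct))
    mode = 'w' if initialize else 'a'
    with open(fname, mode) as fp:
        if not initialize and loss is not None:
            fp.write("{}\n".format(loss))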
def train(arglist):
    """Run the MADDPG algorithm using the passed-in command-line arguments.

    Args:
        arglist (argparse.Namespace): Parsed command-line arguments object.
    """
    # Assign roles
    training_role = arglist.training_role[0]
    if arglist.training_role[0] == "defender":
        opponent_role = "attacker"
        opponent_index = 0
    elif arglist.training_role[0] == "attacker":
        opponent_role = "defender"
        opponent_index = 1
    else:
        raise Exception("training role error!")

    # Suppress TensorFlow warnings
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():

        ###########################################
        #          Create environment             #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #         Create agent trainers           #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Training super {} against level 0 to {} opponent.".format(
            arglist.training_role[0], arglist.level))
        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #               Initialize                #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if ((arglist.restore or arglist.load_dir is not None) and arglist.level != 0) or arglist.benchmark:
            print('Loading previous state...')
            print("Level-k folder: " + arglist.load_dir)
            for opp_level in range(0, arglist.level + 1):
                opp_model_file = "level_{}_{}".format(opp_level, opponent_role)
                tf_util.load_state(fname=arglist.load_dir + opp_model_file,
                                   var_prefix="level_{}_{}_{}".format(
                                       opp_level, opponent_role, opponent_index))

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]
        # Individual agent reward. This was changed so that a reward can be
        # tracked for fixed-policy agents as well as learning agents.
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []
        # Agent rewards for training curve
        final_ep_ag_rewards = []
        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                  Start                  #
        ###########################################
        print('Starting iterations...')

        # Initialize opponent selection distribution to uniform
        p_opponent_selection = np.ones(arglist.level + 1) / (arglist.level + 1)
        # Initialize evaluation flags
        evaluate_flag = False
        evaluation_done = False
        # Initialize worst-performing-level list
        worst_performing_levels = []

        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get opponent and training agents' indices
            good_update_index, opponent_select_index, opponent_select_level = \
                get_update_indices(training_role, opponent_role, p_opponent_selection, arglist)
            updating_indices = [good_update_index]

            # Get action
            good_trainer = trainers[good_update_index]
            opp_trainer = trainers[opponent_select_index]
            if good_update_index > opponent_select_index:
                selected_trainers = [opp_trainer, good_trainer]
            elif good_update_index < opponent_select_index:
                selected_trainers = [good_trainer, opp_trainer]
            else:
                raise Exception("Trainer index selection error!")
            action_n = [agent.action(obs) for agent, obs in zip(selected_trainers, obs_n)]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step,
                        new_obs_n, rew_n, done_n, info_n)

            # Update information
            episode_step += 1
            # Check if all agents are done
            # done = all(done_n)
            # Check if any agents are done
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(selected_trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format([rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            '''
            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                # print("Mean Episode Reward: {}".format([np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards]))
                continue
            '''

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, mean episode reward: {}, time: {}".format(
                        len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' +
                                    str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Test the current super agent's performance against all other agents.
            # Check whether every level has been evaluated and update p_select.
            if (terminal or done) and evaluate_flag and evaluation_done:
                evaluate_flag = False
                level_performances = level_performances / arglist.evaluate_length
                np.set_printoptions(precision=2)
                print("Evaluation complete, against level 0 to {} performances: {}".format(
                    arglist.level, level_performances))
                worst_level = np.argmin(level_performances)
                worst_performing_levels.append(worst_level)
                print("Worst performing level is {}".format(worst_level))
                # Update p_select
                # TODO: check some other distributions
                # p_opponent_selection = np.ones(arglist.level + 1) * 0.6 / arglist.level
                p_opponent_selection[worst_level] = 1
                print("Opponent selection probability set to: {}".format(p_opponent_selection))

            # Pop evaluation list and update current evaluate level
            if (terminal or done) and evaluate_flag:
                last_episode_agent_reward = agent_rewards[get_role_index(arglist.training_role[0])][-2]
                level_performances[evaluate_level] += last_episode_agent_reward
                if len(evaluate_levels) == 0:
                    evaluation_done = True
                else:
                    # Get the level to evaluate next
                    evaluate_level = evaluate_levels.pop(0)
                    # Set up p_selection distribution
                    p_opponent_selection = np.zeros(arglist.level + 1)
                    p_opponent_selection[evaluate_level] = 1

            # Set up evaluation schedules
            if (terminal or done) and (len(episode_rewards) % arglist.evaluate_rate == 0):
                print("Freezing current super-agent's network and performing evaluation.")
                evaluate_flag = True
                evaluation_done = False
                eval_len = arglist.evaluate_length
                evaluate_levels = []
                level_performances = np.zeros(arglist.level + 1)
                for level in range(arglist.level + 1):
                    for i in range(eval_len):
                        evaluate_levels.append(level)
                evaluate_level = evaluate_levels.pop(0)

            # In evaluate mode, don't perform model updates
            if evaluate_flag:
                continue

            # If not in display or benchmark mode, update the trainers whose
            # index is in updating_indices.
            loss = None
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    agent.preupdate()
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    loss = agent.update(selected_trainers, train_step)
                    if arglist.log_loss and loss is not None:
                        log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {:.2f} s".format(
                    len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old
                # networks unintentionally?
                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards) + prev_ep_ct,
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards],
                              round(time.time() - t_start, 3)))
                if arglist.level_k_select_print:
                    print("Opponent selection probability: {}".format(p_opponent_selection))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))

                # Pickle dump training-curve info
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                worst_level_file_name = arglist.plots_dir + arglist.exp_name + '_worst_performing_level.pkl'
                with open(worst_level_file_name, 'wb') as fp:
                    pickle.dump(worst_performing_levels, fp)

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State",
                                arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                print('...Worst performing history: {}'.format(worst_performing_levels))
                break
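# The level-k loop above depends on two helpers, `get_update_indices` and
# `get_role_index`, that are defined elsewhere in the repo. A minimal sketch of
# the assumed behaviour: the role indexing follows the attacker=0 / defender=1
# convention implied by the role-assignment block at the top of train(), and
# the opponent level is sampled from p_opponent_selection. The trainer layout
# (opponent policies in slots 0..level, the super agent in the final slot) is
# an assumption, not the repo's verified implementation.
def get_role_index(role):
    """Map a role name to its index in the per-episode reward lists (assumed order)."""
    return {"attacker": 0, "defender": 1}[role]


def get_update_indices(training_role, opponent_role, p_opponent_selection, arglist):
    """Pick the trainer index to update and the opponent trainer index/level."""
    # Normalize defensively before sampling the opponent's level
    p = np.asarray(p_opponent_selection, dtype=float)
    p = p / p.sum()
    opponent_select_level = int(np.random.choice(len(p), p=p))
    # Assumed layout: trainers[0..level] hold opponent policies by level, and
    # the final slot holds the "super" agent currently being trained.
    good_update_index = arglist.level + 1
    opponent_select_index = opponent_select_level
    return good_update_index, opponent_select_index, opponent_select_level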