def test_pettingzoo_env(self):
    register_env("simple_spread",
                 lambda _: PettingZooEnv(simple_spread_v2.env()))

    env = PettingZooEnv(simple_spread_v2.env())
    observation_space = env.observation_space
    action_space = env.action_space
    del env

    agent_class = get_trainer_class("PPO")
    config = deepcopy(agent_class.get_default_config())

    config["multiagent"] = {
        # Set of policy IDs (by default, will use Trainer's
        # default policy class, the env's obs/act spaces and config={}).
        "policies": {"av": (None, observation_space, action_space, {})},
        # Mapping function that always returns "av" as policy ID to use
        # (for any agent).
        "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
    }

    config["log_level"] = "DEBUG"
    config["num_workers"] = 0
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    # After n steps, force reset simulation
    config["horizon"] = 200
    config["no_done_at_end"] = False

    agent = agent_class(env="simple_spread", config=config)
    agent.train()
def test_pettingzoo_env(self):
    register_env("simple_spread",
                 lambda _: PettingZooEnv(simple_spread_v2.env()))

    agent_class = get_agent_class("PPO")
    config = deepcopy(agent_class._default_config)

    test_env = PettingZooEnv(simple_spread_v2.env())
    obs_space = test_env.observation_space
    act_space = test_env.action_space
    test_env.close()

    config["multiagent"] = {
        "policies": {
            # the first tuple value is None -> uses default policy
            "av": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: "av",
    }

    config["log_level"] = "DEBUG"
    config["num_workers"] = 0
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    # After n steps, force reset simulation
    config["horizon"] = 200
    config["no_done_at_end"] = False

    agent = agent_class(env="simple_spread", config=config)
    agent.train()
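# The two variants of test_pettingzoo_env above target different RLlib
# releases: the first uses get_trainer_class()/get_default_config() and the
# newer policy_mapping_fn(agent_id, episode, **kwargs) signature, the second
# the older get_agent_class()/_default_config and a single-argument mapping
# function. A minimal, hypothetical unittest harness for either variant is
# sketched below; the class name and Ray setup are assumptions, not part of
# the original source, and the exact import paths for register_env,
# PettingZooEnv and get_trainer_class/get_agent_class depend on the installed
# Ray/RLlib version.
import unittest

import ray


class TestPettingZooEnv(unittest.TestCase):
    def setUp(self):
        # Start a local Ray instance for the RLlib trainer to use.
        ray.init(num_cpus=4)

    def tearDown(self):
        ray.shutdown()

    # One of the test_pettingzoo_env definitions above would live here.


if __name__ == "__main__":
    unittest.main()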
def __init__(self, name, max_step=None):
    if name == "simple_spread":
        self.env = simple_spread_v2.env()
    elif name == "waterworld":
        self.env = waterworld_v3.env()
    elif name == "multiwalker":
        self.env = multiwalker_v6.env()
    else:
        raise AssertionError("wrong env name.")
    self.max_step = max_step
    self.curr_step = 0
    self.name = name
    self.agents = self.env.possible_agents
    self.env.reset()
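# The constructor above only builds and resets the chosen PettingZoo
# environment; a wrapper like this presumably also exposes reset() and step().
# The sketch below is a hypothetical, minimal version (the method names and
# 4-tuple return format are assumptions, not from the original source) that
# uses max_step/curr_step to truncate long episodes, following the older
# PettingZoo AEC API in which env.last() returns (obs, reward, done, info).
def reset(self):
    self.env.reset()
    self.curr_step = 0
    # Return the observation of the first agent to act.
    obs, _, _, _ = self.env.last()
    return obs

def step(self, action):
    self.curr_step += 1
    self.env.step(action)
    obs, reward, done, info = self.env.last()
    # Force episode termination once the optional step budget is used up.
    if self.max_step is not None and self.curr_step >= self.max_step:
        done = True
    return obs, reward, done, info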
def train(arglist):
    with U.single_threaded_session():
        if arglist.scenario == "Uno":
            from pettingzoo.classic import uno_v1
            env = uno_v1.env(opponents_hand_visible=False)
        elif arglist.scenario == "Texas":
            from pettingzoo.classic import texas_holdem_no_limit_v1
            env = texas_holdem_no_limit_v1.env()
        elif arglist.scenario == "Leduc":
            from pettingzoo.classic import leduc_holdem_v1
            env = leduc_holdem_v1.env()
        elif arglist.scenario == "Limit":
            from pettingzoo.classic import texas_holdem_v1
            env = texas_holdem_v1.env()
        elif arglist.scenario == "Backgammon":
            from pettingzoo.classic import backgammon_v1
            env = backgammon_v1.env()
        elif arglist.scenario == "Adversary":
            from pettingzoo.mpe import simple_adversary_v2
            env = simple_adversary_v2.env(N=2, max_cycles=25)
        elif arglist.scenario == "Crypto":
            from pettingzoo.mpe import simple_crypto_v2
            env = simple_crypto_v2.env(max_cycles=25)
        elif arglist.scenario == "Spread":
            from pettingzoo.mpe import simple_spread_v2
            env = simple_spread_v2.env(N=3, local_ratio=0.5, max_cycles=25)
        elif arglist.scenario == "SpeakerListener":
            from pettingzoo.mpe import simple_speaker_listener_v3
            env = simple_speaker_listener_v3.env(max_cycles=25)
        elif arglist.scenario == "WorldCom":
            from pettingzoo.mpe import simple_world_comm_v2
            env = simple_world_comm_v2.env(num_good=2, num_adversaries=3,
                                           num_obstacles=1, num_food=2,
                                           max_cycles=25)
        else:
            raise ValueError("no scenario found")

        # Flattened observation shape for each player.
        obs_shape_n = []
        for player, space in env.observation_spaces.items():
            val = np.product(space.shape)
            obs_shape_n.append((val,))
        num_players = len(env.observation_spaces)
        print("Playing with: ", num_players, " players")
        num_adversaries = min(num_players, arglist.num_adversaries)

        # Create agent trainers
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_players)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        current_agent_index = 0
        num_adversaries = arglist.num_adversaries

        print('Starting iterations...')
        while True:
            # Act for the currently selected agent only (AEC-style stepping).
            agent = env.agents[current_agent_index]
            trainer = trainers[current_agent_index]
            player_key = env.agent_selection

            obs = env.observe(agent=agent).flatten()
            action_probability = trainer.action(obs)
            # Sample a discrete action from the policy's probability vector.
            action = np.random.choice(
                a=np.linspace(0, len(action_probability) - 1,
                              num=len(action_probability), dtype=int),
                size=1, p=action_probability)[0]

            obs_n = env.observe(agent).flatten()
            env.step(action)
            new_obs_n, rew_n, done_n, info_n = (env.observe(agent).flatten(),
                                                env.rewards, env.dones,
                                                env.infos)
            player_info = info_n.get(player_key)

            rew_array = rew_n.values()
            episode_step += 1
            done = all(done_n.values())
            terminal = (episode_step >= arglist.max_episode_len)

            # experience(self, obs, act, mask, rew, new_obs, done)
            trainer.experience(obs_n, action_probability, None,
                               [rew_n.get(player_key)], new_obs_n,
                               done_n.get(player_key))

            for i, rew in enumerate(rew_array):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                # Change sleep frequency here if desired
                if (loss is not None and agent.sleep_regimen
                        and agent.agent_mic != 0 and train_step % 100 == 0):
                    original_policy_loss = loss[1]
                    new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                    sleep_iteration = 0
                    while (sleep_iteration < 10
                           and new_loss < original_policy_loss * 1.05):
                        new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                        sleep_iteration += 1
                        # print("sleep walking")

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break

            # Move on to the next agent, wrapping around after the last one.
            current_agent_index += 1
            if current_agent_index > num_players - 1:
                current_agent_index = 0
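# Hypothetical entry point for train() above; parse_args() is an assumption
# and not part of the original source. Whatever parser is used would need to
# define every flag train() reads: scenario, num_adversaries, good_policy,
# max_episode_len, benchmark, benchmark_iters, benchmark_dir, display,
# save_rate, save_dir, num_episodes, plots_dir and exp_name.
if __name__ == '__main__':
    arglist = parse_args()
    train(arglist)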