def test_variable_spaces(self):
    env = MultiagentPettingZooEnv(simple_world_comm_v2.env(), name="simple_world_comm_v2", device='cpu')
    state = env.reset()
    # check that each agent's observation lies in its observation space,
    # and that actions sampled from its action space are accepted
    for agent in env.agents:
        state = env.last()
        self.assertTrue(env.observation_spaces[agent].contains(
            state['observation'].cpu().detach().numpy()))
        env.step(env.action_spaces[env.agent_selection].sample())
def test_all():
    NUM_ENVS = 5
    NUM_CPUS = 2

    def test_vec_env(vec_env):
        vec_env.reset()
        obs, rew, agent_done, env_done, agent_passes, infos = vec_env.last()
        print(np.asarray(obs).shape)
        assert len(obs) == NUM_ENVS
        act_space = vec_env.action_spaces[vec_env.agent_selection]
        assert np.all(np.equal(obs, vec_env.observe(vec_env.agent_selection)))
        assert len(vec_env.observe(vec_env.agent_selection)) == NUM_ENVS
        vec_env.step([act_space.sample() for _ in range(NUM_ENVS)])
        obs, rew, agent_done, env_done, agent_passes, infos = vec_env.last(observe=False)
        assert obs is None

    def test_infos(vec_env):
        vec_env.reset()
        infos = vec_env.infos[vec_env.agent_selection]
        assert infos[1]["legal_moves"]

    def test_seed(vec_env):
        vec_env.seed(4)

    def test_some_done(vec_env):
        vec_env.reset()
        act_space = vec_env.action_spaces[vec_env.agent_selection]
        assert not any(done for dones in vec_env.dones.values() for done in dones)
        vec_env.step([act_space.sample() for _ in range(NUM_ENVS)])
        assert any(done for dones in vec_env.dones.values() for done in dones)
        assert any(rew != 0 for rews in vec_env.rewards.values() for rew in rews)

    def select_action(vec_env, passes, i):
        # prefer a legal move when the env reports them for this copy
        my_info = vec_env.infos[vec_env.agent_selection][i]
        if not passes[i] and "legal_moves" in my_info:
            return random.choice(my_info["legal_moves"])
        else:
            act_space = vec_env.action_spaces[vec_env.agent_selection]
            return act_space.sample()

    for num_cpus in [0, 1]:
        test_vec_env(vectorize_aec_env_v0(rps_v1.env(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(mahjong_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_infos(vectorize_aec_env_v0(hanabi_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_some_done(vectorize_aec_env_v0(mahjong_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(knights_archers_zombies_v7.env(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(simple_world_comm_v2.env(), NUM_ENVS, num_cpus=num_cpus))
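# Minimal usage sketch (not part of the tests above), assuming the same
# supersuit vectorize_aec_env_v0 API exercised by test_all: the vectorized env
# runs num_envs copies of an AEC env in lockstep, so last() returns one entry
# per copy and step() expects one action per copy.
def example_vec_env_rollout(num_envs=4):
    from pettingzoo.butterfly import knights_archers_zombies_v7
    from supersuit import vectorize_aec_env_v0

    vec_env = vectorize_aec_env_v0(knights_archers_zombies_v7.env(), num_envs, num_cpus=0)
    vec_env.reset()
    obs, rew, agent_done, env_done, agent_passes, infos = vec_env.last()
    assert len(obs) == num_envs  # one observation per environment copy
    act_space = vec_env.action_spaces[vec_env.agent_selection]
    vec_env.step([act_space.sample() for _ in range(num_envs)])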
def test_pettingzoo_pad_action_space():
    _env = simple_world_comm_v2.env()
    wrapped_env = pad_action_space_v0(_env)
    api_test.api_test(wrapped_env)
    seed_test.seed_test(lambda: sticky_actions_v0(simple_world_comm_v2.env(), 0.5))
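# Minimal usage sketch for pad_action_space_v0, as tested above: agents in
# simple_world_comm_v2 have differently sized discrete action spaces, and the
# wrapper pads them all up to the largest so one shared policy head can act
# for every agent.
def example_pad_action_space():
    from pettingzoo.mpe import simple_world_comm_v2
    from supersuit import pad_action_space_v0

    padded_env = pad_action_space_v0(simple_world_comm_v2.env())
    padded_env.reset()
    sizes = {agent: padded_env.action_spaces[agent].n for agent in padded_env.agents}
    assert len(set(sizes.values())) == 1  # all agents now share one action size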
def _make_env(self):
    return MultiagentPettingZooEnv(simple_world_comm_v2.env(), name="simple_world_comm_v2", device='cpu')
def train(arglist):
    with U.single_threaded_session():
        # select the environment for the requested scenario
        if arglist.scenario == "Uno":
            from pettingzoo.classic import uno_v1
            env = uno_v1.env(opponents_hand_visible=False)
        elif arglist.scenario == "Texas":
            from pettingzoo.classic import texas_holdem_no_limit_v1
            env = texas_holdem_no_limit_v1.env()
        elif arglist.scenario == "Leduc":
            from pettingzoo.classic import leduc_holdem_v1
            env = leduc_holdem_v1.env()
        elif arglist.scenario == "Limit":
            from pettingzoo.classic import texas_holdem_v1
            env = texas_holdem_v1.env()
        elif arglist.scenario == "Backgammon":
            from pettingzoo.classic import backgammon_v1
            env = backgammon_v1.env()
        elif arglist.scenario == "Adversary":
            from pettingzoo.mpe import simple_adversary_v2
            env = simple_adversary_v2.env(N=2, max_cycles=25)
        elif arglist.scenario == "Crypto":
            from pettingzoo.mpe import simple_crypto_v2
            env = simple_crypto_v2.env(max_cycles=25)
        elif arglist.scenario == "Spread":
            from pettingzoo.mpe import simple_spread_v2
            env = simple_spread_v2.env(N=3, local_ratio=0.5, max_cycles=25)
        elif arglist.scenario == "SpeakerListener":
            from pettingzoo.mpe import simple_speaker_listener_v3
            env = simple_speaker_listener_v3.env(max_cycles=25)
        elif arglist.scenario == "WorldCom":
            from pettingzoo.mpe import simple_world_comm_v2
            env = simple_world_comm_v2.env(num_good=2, num_adversaries=3, num_obstacles=1,
                                           num_food=2, max_cycles=25)
        else:
            raise ValueError("no scenario found: {}".format(arglist.scenario))

        # flattened observation shape for each player
        obs_shape_n = []
        for player, space in env.observation_spaces.items():
            val = np.prod(space.shape)
            obs_shape_n.append((val,))
        num_players = len(env.observation_spaces)
        print("Playing with: ", num_players, " players")
        num_adversaries = min(num_players, arglist.num_adversaries)

        # Create agent trainers
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_players)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        current_agent_index = 0
        print('Starting iterations...')

        while True:
            agent = env.agents[current_agent_index]
            trainer = trainers[current_agent_index]
            player_key = env.agent_selection

            # sample an action index from the trainer's policy distribution
            obs_n = env.observe(agent=agent).flatten()
            action_probability = trainer.action(obs_n)
            action = np.random.choice(len(action_probability), p=action_probability)

            env.step(action)
            new_obs_n, rew_n, done_n, info_n = (env.observe(agent).flatten(),
                                                env.rewards, env.dones, env.infos)
            player_info = info_n.get(player_key)
            rew_array = rew_n.values()
            episode_step += 1
            done = all(done_n.values())
            terminal = (episode_step >= arglist.max_episode_len)

            # experience(self, obs, act, mask, rew, new_obs, done)
            trainer.experience(obs_n, action_probability, None,
                               [rew_n.get(player_key)], new_obs_n,
                               done_n.get(player_key))
            for i, rew in enumerate(rew_array):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for t in trainers:
                t.preupdate()
            for t in trainers:
                loss = t.update(trainers, train_step)
                # Change sleep frequency here if desired
                if loss is not None and t.sleep_regimen and t.agent_mic != 0 and train_step % 100 == 0:
                    original_policy_loss = loss[1]
                    new_loss = t.update(trainers, train_step, sleeping=True)[1]
                    sleep_iteration = 0
                    while sleep_iteration < 10 and new_loss < original_policy_loss * 1.05:
                        new_loss = t.update(trainers, train_step, sleeping=True)[1]
                        sleep_iteration += 1

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break

            # advance to the next agent, wrapping around
            current_agent_index = (current_agent_index + 1) % num_players
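# For reference, a minimal sketch of the standard PettingZoo AEC loop that
# train() above implements by hand; policy is a hypothetical stand-in for
# trainer.action followed by sampling, and done agents must be stepped with
# None under this API.
def example_aec_episode(env, policy):
    env.reset()
    for agent in env.agent_iter():
        obs, reward, done, info = env.last()
        action = None if done else policy(obs, agent)
        env.step(action)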
    supersuit.resize_v1(
        dtype_v0(knights_archers_zombies_v10.env(vector_state=False), np.uint8),
        x_size=5, y_size=10, linear_interp=True,
    ),
    supersuit.dtype_v0(knights_archers_zombies_v10.env(), np.int32),
    supersuit.flatten_v0(knights_archers_zombies_v10.env()),
    supersuit.reshape_v0(knights_archers_zombies_v10.env(vector_state=False), (512 * 512, 3)),
    supersuit.normalize_obs_v0(dtype_v0(knights_archers_zombies_v10.env(), np.float32), env_min=-1, env_max=5.0),
    supersuit.frame_stack_v1(combined_arms_v6.env(), 8),
    supersuit.pad_observations_v0(simple_world_comm_v2.env()),
    supersuit.pad_action_space_v0(simple_world_comm_v2.env()),
    supersuit.black_death_v3(combined_arms_v6.env()),
    supersuit.agent_indicator_v0(knights_archers_zombies_v10.env(), True),
    supersuit.agent_indicator_v0(knights_archers_zombies_v10.env(), False),
    supersuit.reward_lambda_v0(knights_archers_zombies_v10.env(), lambda x: x / 10),
    supersuit.clip_reward_v0(combined_arms_v6.env()),
    supersuit.nan_noop_v0(knights_archers_zombies_v10.env(), 0),
    supersuit.nan_zeros_v0(knights_archers_zombies_v10.env()),
    supersuit.nan_random_v0(chess_v5.env()),
    supersuit.nan_random_v0(knights_archers_zombies_v10.env()),
    supersuit.frame_skip_v0(combined_arms_v6.env(), 4),
    supersuit.sticky_actions_v0(combined_arms_v6.env(), 0.75),
    supersuit.delay_observations_v0(combined_arms_v6.env(), 3),
    supersuit.max_observation_v0(knights_archers_zombies_v10.env(), 3),