def test_variable_spaces(self):
     env = MultiagentPettingZooEnv(simple_world_comm_v2.env(),
                                   name="simple_world_comm_v2",
                                   device='cpu')
     state = env.reset()
     # check that each observation fits its agent's observation space and that sampled actions are accepted
     for agent in env.agents:
         state = env.last()
         self.assertTrue(env.observation_spaces[agent].contains(
             state['observation'].cpu().detach().numpy()))
         env.step(env.action_spaces[env.agent_selection].sample())
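The test above assumes PettingZoo's MPE suite and a MultiagentPettingZooEnv wrapper; a minimal setup sketch, where the wrapper's import path (all.environments, from the autonomous-learning-library) is an assumption:

from pettingzoo.mpe import simple_world_comm_v2
from all.environments import MultiagentPettingZooEnv  # assumed import path

env = MultiagentPettingZooEnv(simple_world_comm_v2.env(),
                              name="simple_world_comm_v2",
                              device='cpu')
env.reset()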
Example #2
def test_all():
    NUM_ENVS = 5
    NUM_CPUS = 2

    def test_vec_env(vec_env):
        vec_env.reset()
        obs, rew, agent_done, env_done, agent_passes, infos = vec_env.last()
        print(np.asarray(obs).shape)
        assert len(obs) == NUM_ENVS
        act_space = vec_env.action_spaces[vec_env.agent_selection]
        assert np.all(np.equal(obs, vec_env.observe(vec_env.agent_selection)))
        assert len(vec_env.observe(vec_env.agent_selection)) == NUM_ENVS
        vec_env.step([act_space.sample() for _ in range(NUM_ENVS)])
        obs, rew, agent_done, env_done, agent_passes, infos = vec_env.last(observe=False)
        assert obs is None

    def test_infos(vec_env):
        vec_env.reset()
        infos = vec_env.infos[vec_env.agent_selection]
        assert infos[1]["legal_moves"]

    def test_seed(vec_env):
        vec_env.seed(4)

    def test_some_done(vec_env):
        vec_env.reset()
        act_space = vec_env.action_spaces[vec_env.agent_selection]
        assert not any(done for dones in vec_env.dones.values() for done in dones)
        vec_env.step([act_space.sample() for _ in range(NUM_ENVS)])
        assert any(done for dones in vec_env.dones.values() for done in dones)
        assert any(rew != 0 for rews in vec_env.rewards.values() for rew in rews)

    def select_action(vec_env, passes, i):
        my_info = vec_env.infos[vec_env.agent_selection][i]
        if not passes[i] and "legal_moves" in my_info:
            return random.choice(my_info["legal_moves"])
        else:
            act_space = vec_env.action_spaces[vec_env.agent_selection]
            return act_space.sample()

    for num_cpus in [0, 1]:
        test_vec_env(vectorize_aec_env_v0(rps_v1.env(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(mahjong_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_infos(vectorize_aec_env_v0(hanabi_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_some_done(vectorize_aec_env_v0(mahjong_maker(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(knights_archers_zombies_v7.env(), NUM_ENVS, num_cpus=num_cpus))
        test_vec_env(vectorize_aec_env_v0(simple_world_comm_v2.env(), NUM_ENVS, num_cpus=num_cpus))
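test_all depends on helper constructors (mahjong_maker, hanabi_maker) defined elsewhere; a minimal sketch of what they might look like, assuming they simply build the classic PettingZoo environments (the exact module versions are guesses):

from pettingzoo.classic import mahjong_v4, hanabi_v4  # versions assumed

def mahjong_maker():
    # Mahjong finishes agents at different times, which test_some_done relies on.
    return mahjong_v4.env()

def hanabi_maker():
    # Hanabi reports "legal_moves" in each agent's info dict, which test_infos checks.
    return hanabi_v4.env()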
Example #3
def test_pettingzoo_pad_action_space():
    _env = simple_world_comm_v2.env()
    wrapped_env = pad_action_space_v0(_env)
    api_test.api_test(wrapped_env)
    seed_test.seed_test(
        lambda: sticky_actions_v0(simple_world_comm_v2.env(), 0.5))
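The wrapper test above leans on SuperSuit and PettingZoo's test utilities; a sketch of the imports it assumes (the test-utility module path changed across PettingZoo releases, so treat it as an assumption):

from pettingzoo.mpe import simple_world_comm_v2
from pettingzoo.tests import api_test, seed_test  # "pettingzoo.test" in newer releases
from supersuit import pad_action_space_v0, sticky_actions_v0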
Example #4
 def _make_env(self):
     return MultiagentPettingZooEnv(simple_world_comm_v2.env(),
                                    name="simple_world_comm_v2",
                                    device='cpu')
Example #5
def train(arglist):
    with U.single_threaded_session():
        if(arglist.scenario == "Uno"):
            from pettingzoo.classic import uno_v1
            env = uno_v1.env(opponents_hand_visible=False)
        elif(arglist.scenario == "Texas"):
            from pettingzoo.classic import texas_holdem_no_limit_v1
            env = texas_holdem_no_limit_v1.env()
        elif(arglist.scenario == "Leduc"):
            from pettingzoo.classic import leduc_holdem_v1
            env = leduc_holdem_v1.env()
        elif(arglist.scenario == "Limit"):
            from pettingzoo.classic import texas_holdem_v1
            env = texas_holdem_v1.env()
        elif(arglist.scenario == "Backgammon"):
            from pettingzoo.classic import backgammon_v1
            env = backgammon_v1.env()
        elif(arglist.scenario == "Adversary"):
            from pettingzoo.mpe import simple_adversary_v2
            env = simple_adversary_v2.env(N=2, max_cycles=25)
        elif(arglist.scenario == "Crypto"):
            from pettingzoo.mpe import simple_crypto_v2
            env = simple_crypto_v2.env(max_cycles=25)
        elif(arglist.scenario == "Spread"):
            from pettingzoo.mpe import simple_spread_v2
            env = simple_spread_v2.env(N=3, local_ratio=0.5, max_cycles=25)
        elif(arglist.scenario == "SpeakerListener"):
            from pettingzoo.mpe import simple_speaker_listener_v3
            env = simple_speaker_listener_v3.env(max_cycles=25)
        elif(arglist.scenario == "WorldCom"):   
            from pettingzoo.mpe import simple_world_comm_v2
            env = simple_world_comm_v2.env(num_good=2, num_adversaries=3, num_obstacles=1, num_food=2, max_cycles=25)
        else:
            raise ValueError("Unknown scenario: {}".format(arglist.scenario))
        
        obs_shape_n = []
        for player, space in env.observation_spaces.items():
            # flatten each agent's observation space into a 1-D shape tuple
            obs_shape_n.append((int(np.prod(space.shape)),))
        
        num_players = len(env.observation_spaces)
        print("Playing with: ", num_players, " players")

        num_adversaries = min(num_players, arglist.num_adversaries)
        # Create agent trainers
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_players)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        current_agent_index = 0
        num_adversaries = arglist.num_adversaries

        print('Starting iterations...')
        while True:

            agent = env.agents[current_agent_index]
            trainer = trainers[current_agent_index]
            player_key = env.agent_selection
            obs = env.observe(agent=agent).flatten()
            action_probability = trainer.action(obs)

            print("action_probability: ", action_probability)
            
            
            action = np.random.choice(a=np.linspace(0,len(action_probability)-1, num=len(action_probability), dtype=int), size=1, p=action_probability)[0]
            
            print(action)
            print(env.observe(agent).flatten())
            print(env.rewards)
            print(env.dones) 
            print(env.infos)
            
            obs_n = env.observe(agent).flatten()
            env.step(action)
            new_obs_n, rew_n, done_n, info_n = env.observe(agent).flatten(), env.rewards, env.dones, env.infos
            player_info = info_n.get(player_key)

            rew_array = rew_n.values()

            episode_step += 1
            # use .values(); all() over the dict itself would only test the (truthy) agent-name keys
            done = all(done_n.values())
            terminal = (episode_step >= arglist.max_episode_len)

            # experience(self, obs, act, mask, rew, new_obs, done)
            trainer.experience(obs_n, action_probability, None, [rew_n.get(player_key)], new_obs_n, done_n.get(player_key))

            for i, rew in enumerate(rew_array):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

                if(loss is not None and agent.sleep_regimen and agent.agent_mic != 0 and train_step % 100 == 0): # Change sleep frequency here if desired
                    original_policy_loss = loss[1]
                    new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                    sleep_iteration = 0
                    while((sleep_iteration < 10) and (new_loss < original_policy_loss * 1.05)):
                        new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                        sleep_iteration += 1 
                        #print("sleep walking")

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
            
            current_agent_index += 1
            if(current_agent_index > num_players - 1):
                current_agent_index = 0
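train reads a number of fields from arglist that are not defined in this excerpt; a hypothetical argparse setup covering just the attributes the loop above touches (names mirror the code, defaults are guesses):

import argparse

def parse_args():
    # Hypothetical parser; only the fields referenced by train() above.
    parser = argparse.ArgumentParser("MADDPG-style training on PettingZoo environments")
    parser.add_argument("--scenario", type=str, default="Spread")
    parser.add_argument("--max-episode-len", type=int, default=25)
    parser.add_argument("--num-episodes", type=int, default=60000)
    parser.add_argument("--num-adversaries", type=int, default=0)
    parser.add_argument("--good-policy", type=str, default="maddpg")
    parser.add_argument("--save-rate", type=int, default=1000)
    parser.add_argument("--save-dir", type=str, default="./policy/")
    parser.add_argument("--exp-name", type=str, default="experiment")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/")
    parser.add_argument("--benchmark", action="store_true", default=False)
    parser.add_argument("--benchmark-iters", type=int, default=100000)
    parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/")
    parser.add_argument("--display", action="store_true", default=False)
    return parser.parse_args()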
 # SuperSuit wrapper configurations applied to PettingZoo environments
 supersuit.resize_v1(
     dtype_v0(knights_archers_zombies_v10.env(vector_state=False),
              np.uint8),
     x_size=5,
     y_size=10,
     linear_interp=True,
 ),
 supersuit.dtype_v0(knights_archers_zombies_v10.env(), np.int32),
 supersuit.flatten_v0(knights_archers_zombies_v10.env()),
 supersuit.reshape_v0(knights_archers_zombies_v10.env(vector_state=False),
                      (512 * 512, 3)),
 supersuit.normalize_obs_v0(dtype_v0(knights_archers_zombies_v10.env(),
                                     np.float32),
                            env_min=-1,
                            env_max=5.0),
 supersuit.frame_stack_v1(combined_arms_v6.env(), 8),
 supersuit.pad_observations_v0(simple_world_comm_v2.env()),
 supersuit.pad_action_space_v0(simple_world_comm_v2.env()),
 supersuit.black_death_v3(combined_arms_v6.env()),
 supersuit.agent_indicator_v0(knights_archers_zombies_v10.env(), True),
 supersuit.agent_indicator_v0(knights_archers_zombies_v10.env(), False),
 supersuit.reward_lambda_v0(knights_archers_zombies_v10.env(),
                            lambda x: x / 10),
 supersuit.clip_reward_v0(combined_arms_v6.env()),
 supersuit.nan_noop_v0(knights_archers_zombies_v10.env(), 0),
 supersuit.nan_zeros_v0(knights_archers_zombies_v10.env()),
 supersuit.nan_random_v0(chess_v5.env()),
 supersuit.nan_random_v0(knights_archers_zombies_v10.env()),
 supersuit.frame_skip_v0(combined_arms_v6.env(), 4),
 supersuit.sticky_actions_v0(combined_arms_v6.env(), 0.75),
 supersuit.delay_observations_v0(combined_arms_v6.env(), 3),
 supersuit.max_observation_v0(knights_archers_zombies_v10.env(), 3),
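Each wrapped environment above can be exercised with the standard PettingZoo AEC loop of that era (tuple return from last(), action_spaces as a dict); a minimal sketch using one wrapper from the list:

from pettingzoo.mpe import simple_world_comm_v2
import supersuit

env = supersuit.pad_action_space_v0(simple_world_comm_v2.env())
env.reset()
for agent in env.agent_iter(max_iter=200):
    obs, reward, done, info = env.last()
    # agents that are done must step with a None action in the AEC API
    action = None if done else env.action_spaces[agent].sample()
    env.step(action)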