Example no. 1
    def test_pettingzoo_env(self):
        register_env("simple_spread",
                     lambda _: PettingZooEnv(simple_spread_v2.env()))
        env = PettingZooEnv(simple_spread_v2.env())
        observation_space = env.observation_space
        action_space = env.action_space
        del env

        agent_class = get_trainer_class("PPO")

        config = deepcopy(agent_class.get_default_config())

        config["multiagent"] = {
            # Set of policy IDs (by default, will use Trainer's
            # default policy class, the env's obs/act spaces and config={}).
            "policies": {
                "av": (None, observation_space, action_space, {})
            },
            # Mapping function that always returns "av" as policy ID to use
            # (for any agent).
            "policy_mapping_fn": lambda agent_id, episode, **kwargs: "av",
        }

        config["log_level"] = "DEBUG"
        config["num_workers"] = 0
        config["rollout_fragment_length"] = 30
        config["train_batch_size"] = 200
        config["horizon"] = 200  # After n steps, force reset simulation
        config["no_done_at_end"] = False

        agent = agent_class(env="simple_spread", config=config)
        agent.train()
Example no. 2
    def test_pettingzoo_env(self):
        register_env("simple_spread",
                     lambda _: PettingZooEnv(simple_spread_v2.env()))

        agent_class = get_agent_class("PPO")

        config = deepcopy(agent_class._default_config)

        test_env = PettingZooEnv(simple_spread_v2.env())
        obs_space = test_env.observation_space
        act_space = test_env.action_space
        test_env.close()

        config["multiagent"] = {
            "policies": {
                # the first tuple value is None -> uses default policy
                "av": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": lambda agent_id: "av"
        }

        config["log_level"] = "DEBUG"
        config["num_workers"] = 0
        config["rollout_fragment_length"] = 30
        config["train_batch_size"] = 200
        config["horizon"] = 200  # After n steps, force reset simulation
        config["no_done_at_end"] = False

        agent = agent_class(env="simple_spread", config=config)
        agent.train()
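Examples 1 and 2 above rely on imports that the snippets do not show. The following is only a sketch of what they assume, for Ray/RLlib 1.x; older releases expose get_agent_class instead of get_trainer_class and import PettingZooEnv from ray.rllib.env.

from copy import deepcopy

from pettingzoo.mpe import simple_spread_v2
from ray.tune.registry import register_env
from ray.rllib.agents.registry import get_trainer_class  # older releases: get_agent_class
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv  # older releases: from ray.rllib.env import PettingZooEnv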
Example no. 3
 def __init__(self, name, max_step=None):
     if name == "simple_spread":
         self.env = simple_spread_v2.env()
     elif name == "waterworld":
         self.env = waterworld_v3.env()
     elif name == "multiwalker":
         self.env = multiwalker_v6.env()
     else:
         raise ValueError("wrong env name.")
     self.max_step = max_step
     self.curr_step = 0
     self.name = name
     self.agents = self.env.possible_agents
     self.env.reset()
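The constructor above assumes the environment modules are already imported. A minimal sketch of those imports, assuming the MPE and SISL families of PettingZoo:

from pettingzoo.mpe import simple_spread_v2
from pettingzoo.sisl import multiwalker_v6, waterworld_v3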
Example no. 4
def train(arglist):
    with U.single_threaded_session():
        if(arglist.scenario == "Uno"):
            from pettingzoo.classic import uno_v1
            env = uno_v1.env(opponents_hand_visible=False)
        elif(arglist.scenario == "Texas"):
            from pettingzoo.classic import texas_holdem_no_limit_v1
            env = texas_holdem_no_limit_v1.env()
        elif(arglist.scenario == "Leduc"):
            from pettingzoo.classic import leduc_holdem_v1
            env = leduc_holdem_v1.env()
        elif(arglist.scenario == "Limit"):
            from pettingzoo.classic import texas_holdem_v1
            env = texas_holdem_v1.env()
        elif(arglist.scenario == "Backgammon"):
            from pettingzoo.classic import backgammon_v1
            env = backgammon_v1.env()
        elif(arglist.scenario == "Adversary"):
            from pettingzoo.mpe import simple_adversary_v2
            env = simple_adversary_v2.env(N=2, max_cycles=25)
        elif(arglist.scenario == "Crypto"):
            from pettingzoo.mpe import simple_crypto_v2
            env = simple_crypto_v2.env(max_cycles=25)
        elif(arglist.scenario == "Spread"):
            from pettingzoo.mpe import simple_spread_v2
            env = simple_spread_v2.env(N=3, local_ratio=0.5, max_cycles=25)
        elif(arglist.scenario == "SpeakerListener"):
            from pettingzoo.mpe import simple_speaker_listener_v3
            env = simple_speaker_listener_v3.env(max_cycles=25)
        elif(arglist.scenario == "WorldCom"):   
            from pettingzoo.mpe import simple_world_comm_v2
            env = simple_world_comm_v2.env(num_good=2, num_adversaries=3, num_obstacles=1, num_food=2, max_cycles=25)
        else:
            raise ValueError("no scenario found")
        
        obs_shape_n = []
        for player, space in env.observation_spaces.items():
            val = np.prod(space.shape)
            obs_shape_n.append((val,))
        
        num_players = len(env.observation_spaces)
        print("Playing with: ", num_players, " players")

        num_adversaries = min(num_players, arglist.num_adversaries)
        # Create agent trainers
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_players)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        current_agent_index = 0
        num_adversaries = arglist.num_adversaries

        print('Starting iterations...')
        while True:

            agent = env.agents[current_agent_index]
            trainer = trainers[current_agent_index]
            player_key = env.agent_selection
            obs = env.observe(agent=agent).flatten()
            action_probability = trainer.action(obs)

            print("action_probability: ", action_probability)
            
            
            action = np.random.choice(a=np.linspace(0,len(action_probability)-1, num=len(action_probability), dtype=int), size=1, p=action_probability)[0]
            
            print(action)
            print(env.observe(agent).flatten())
            print(env.rewards)
            print(env.dones) 
            print(env.infos)
            
            obs_n = env.observe(agent).flatten()
            env.step(action)
            new_obs_n, rew_n, done_n, info_n = env.observe(agent).flatten(), env.rewards, env.dones, env.infos
            player_info = info_n.get(player_key)

            print(action)
            print(env.observe(agent).flatten())
            print(env.rewards)
            print(env.dones) 
            print(env.infos)

            rew_array = rew_n.values()

            episode_step += 1
            done = all(done_n.values())
            terminal = (episode_step >= arglist.max_episode_len)

            # experience(self, obs, act, mask, rew, new_obs, done)
            trainer.experience(obs_n, action_probability, None, [rew_n.get(player_key)], new_obs_n, done_n.get(player_key))

            for i, rew in enumerate(rew_array):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n.values()):
                    agent_info[-1][i].append(info)
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

                if(loss is not None and agent.sleep_regimen and agent.agent_mic != 0 and train_step % 100 == 0): # Change sleep frequency here if desired
                    original_policy_loss = loss[1]
                    new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                    sleep_iteration = 0
                    while((sleep_iteration < 10) and (new_loss < original_policy_loss * 1.05)):
                        new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                        sleep_iteration += 1 
                        #print("sleep walking")

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
            
            current_agent_index += 1
            if(current_agent_index > num_players - 1):
                current_agent_index = 0
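For comparison, the loop above advances agents by manually cycling current_agent_index and calling env.observe()/env.step() itself. PettingZoo's AEC API also provides env.agent_iter() together with env.last(), which gives the same per-agent sequencing with less bookkeeping. The following is a minimal sketch, assuming the same pre-1.18 PettingZoo API used above (env.last() returning obs, reward, done, info) and a random policy standing in for the trainers:

from pettingzoo.mpe import simple_spread_v2

env = simple_spread_v2.env(N=3, local_ratio=0.5, max_cycles=25)
env.reset()
for agent in env.agent_iter():
    obs, reward, done, info = env.last()
    # A real policy would map obs to an action; a done agent must be stepped with None.
    action = None if done else env.action_spaces[agent].sample()
    env.step(action)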