Example no. 1
    def __init__(self, env_config=None):

        self.agent_list = [HoldAgent(), agents.SimpleAgent(), HoldAgent(), HoldAgent()]
        # self.agent_list = [agents.SimpleAgent(), agents.SimpleAgent(), agents.SimpleAgent(), agents.RandomAgent()]
        self.all_obs = None
        self.all_action = None
        self.cur_obs = None
        self.alive_agents = [10, 11, 12, 13]
        self.player_agent_id = 10
        self.total_reward = 0

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)
Example no. 2
    def __init__(self, env_config=None):

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)
Example no. 3
    def __init__(self, env_config=None):
        env_config = env_config or {}
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)
Example no. 4
def evaluate_agent(agent,
                   config,
                   selected_labels=None,
                   agent_id=0,
                   iterations=100,
                   plot=True):
    # Instantiate the environment
    env = Pomme(**config["env_kwargs"])
    info = []
    rewards = np.zeros((iterations, 4))
    lengths = np.zeros((iterations, 4))

    if isinstance(agent, EvaluatorAgent):
        agent.reset_run()
    start_time = time.time()
    for i in tqdm(range(iterations)):
        # print('{}/{}'.format(i+1, iterations), end='\r')
        info_ep, reward, lens = run_episode(agent, config, env, agent_id)
        info.append(info_ep)
        rewards[i] = reward
        lengths[i] = lens
        if isinstance(agent, EvaluatorAgent):
            agent.end_episode()

    if plot:
        plot_statistics(agent, info, selected_labels, agent_id, iterations)
    elapsed = time.time() - start_time
    return info, rewards, lengths, elapsed
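A hedged usage sketch for evaluate_agent, assuming MyAgent is a hypothetical placeholder for any pommerman.agents.BaseAgent subclass and that run_episode and the config helpers come from the surrounding project:

config = pommerman.configs.ffa_competition_env()
agent = MyAgent(config["agent"](0, config["game_type"]))  # hypothetical agent class
info, rewards, lengths, elapsed = evaluate_agent(
    agent, config, agent_id=0, iterations=10, plot=False)
print("mean final reward for agent 0:", rewards[:, 0].mean())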
Example no. 5
    def env_for_players(self):
        config = ffa_v0_fast_env(30)
        env = Pomme(**config["env_kwargs"])
        agents = [DQN(config["agent"](0, config["game_type"])),
                  PlayerAgent(config["agent"](1, config["game_type"])),
                  RandomAgent(config["agent"](2, config["game_type"])),
                  RandomAgent(config["agent"](3, config["game_type"]))]
        env.set_agents(agents)
        env.set_training_agent(agents[0].agent_id)  # the DQN agent is the only training agent
        env.set_init_game_state(None)

        return env
Example no. 6
def ffa_evaluate(env: Pomme, episodes, verbose, visualize, stop=False):
    """
    Evaluates the given pommerman environment (already includes the agents).

    :param env: The Pommerman environment (with agents already set)
    :param episodes: The number of episodes
    :param verbose: Whether to print verbose status information
    :param visualize: Whether to visualize the execution
    :param stop: Whether to wait for input after each step
    :return: The results of the evaluation of shape (episodes, 5) where the first column [:, 0] contains the result
             of the match (tie, win, incomplete) and the remaining columns contain the individual (final) rewards.
    """

    # first element: result, additional elements: rewards
    steps = np.empty(episodes)
    results = np.empty((episodes, 1 + 4))

    start = time.time()

    # Run the episodes just like OpenAI Gym
    for i_episode in range(episodes):
        state = env.reset()
        done = False
        reward = []
        info = {}
        step = 0
        while not done:
            if visualize:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            step += 1

            if stop:
                input()

        steps[i_episode] = step

        result = info['result']
        # save the result
        results[i_episode, 0] = result.value
        results[i_episode, 1:] = reward

        if verbose:
            delta = time.time() - start
            print('\r{:.2f} sec > Episode {} finished with {} ({})'.format(
                delta, i_episode, result, reward))

            if i_episode % 10 == 9 and i_episode != episodes - 1:
                ffa_print_stats(results, steps, i_episode + 1)

    env.close()

    if verbose:
        delta = time.time() - start
        print("Total time: {:.2f} sec".format(delta))
        ffa_print_stats(results, steps, episodes)

    return results
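A minimal sketch of driving ffa_evaluate, assuming the Pomme / SimpleAgent imports used in the other snippets here:

config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])
env.set_agents([SimpleAgent(config["agent"](i, config["game_type"]))
                for i in range(4)])
env.set_init_game_state(None)
results = ffa_evaluate(env, episodes=5, verbose=True, visualize=False)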
Example no. 7
def set_pommerman_env(agent_id=0):
    # Instantiate the environment
    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])

    np.random.seed(0)
    env.seed(0)
    # Add 3 Simple Agents and 1 DQN agent
    agents = [
        DQN(config["agent"](agent_id, config["game_type"])) if i == agent_id
        else SimpleAgent(config["agent"](i, config["game_type"]))
        for i in range(4)
    ]
    env.set_agents(agents)
    env.set_training_agent(
        agents[agent_id].agent_id)  # the DQN agent is the only training agent
    env.set_init_game_state(None)

    return env
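A minimal rollout sketch for the env returned above; because set_training_agent was called, env.act only returns the actions of the three SimpleAgents, and the training agent's action (sampled at random here as a stand-in) is inserted at its index:

env = set_pommerman_env(agent_id=0)
obs = env.reset()
done = False
while not done:
    actions = env.act(obs)  # actions of the three SimpleAgents
    actions.insert(env.training_agent, env.action_space.sample())
    obs, rewards, done, info = env.step(actions)
env.close()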
Example no. 8
    def __init__(self, config=pommerman_cfg.team_competition_env()):
        '''
        Initializes the Pommerman environment and adds Dummy Agents as expected by `Pomme`.

        Args:
            config (dict): A config defining the game mode. Options include FFA mode, team (2v2) and team radio (2v2).
            See pommerman's config.py and docs for more details.
        '''
        self.pomme = Pomme(**config['env_kwargs'])
        self.observation_space = dict
        self.action_space = self.pomme.action_space
        self.agent_names = AGENT_IDS
        agent_list = []
        for i in range(4):
            agent_id = i
            agent_list.append(
                agents.BaseAgent(config["agent"](agent_id,
                                                 config["game_type"])))
        self.pomme.set_agents(agent_list)
        self.pomme.set_init_game_state(None)
Example no. 9
def makeTrainingObservation():
    env = Pomme(**config["env_kwargs"])
    agents = {}
    for agent_id in range(num_players):
        agent = TrainingAgent(config["agent"](agent_id, config["game_type"]))              
        agents[agent_id] = agent
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)
    return env
Example no. 10
    def __init__(self, env_config=None):

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            self.reward = Reward()

        print("Pommerman Config:", pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)
Example no. 11
def get_env():
    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])

    agent_id = 0

    agents = [
        DQN(config["agent"](0, config["game_type"])),
        SimpleAgent(config["agent"](1, config["game_type"])),
        SimpleAgent(config["agent"](2, config["game_type"])),
        SimpleAgent(config["agent"](3, config["game_type"])),
    ]

    env.set_agents(agents)

    env.set_training_agent(agents[agent_id].agent_id)
    env.set_init_game_state(None)

    return env
Example no. 12
 def _thunk():
     env = Pomme(**config["env_kwargs"])
     agents = {}
     for agent_id in range(num_players):
         agent = TrainingAgent(config["agent"](agent_id,
                                               config["game_type"]))
         agents[agent_id] = agent
     simple_Agent_id = num_players
     agents[simple_Agent_id] = SimpleAgent(config["agent"](
         simple_Agent_id, config["game_type"]))
     env.set_agents(list(agents.values()))
     env.set_init_game_state(None)
     return env
Example no. 13
 def make_env(self, config):
     # Instantiate the environment
     env = Pomme(**config["env_kwargs"])
     # Add agents
     agents = []
     for agent_id in range(NUM_AGENTS):
         if agent_id == self.agent_id:
             agents.append(self)
         else:
             agents.append(
                 SimpleAgent(config["agent"](agent_id,
                                             config["game_type"])))
     env.set_agents(agents)
     env.set_init_game_state(None)
     return env
Example no. 14
def main(args):
    version = 'v1'
    episodes = args.episodes
    visualize = args.visualize

    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    agent = PPOAgent(
        states=dict(type='float', shape=(11, 11, 12)),
        actions=dict(type='int', num_actions=env.action_space.n),
        network=[
            # (9, 9, 12)
            dict(type='conv2d', size=12, window=3, stride=1),
            # (7, 7, 8)
            dict(type='conv2d', size=8, window=3, stride=1),
            # (5, 5, 4)
            dict(type='conv2d', size=4, window=3, stride=1),
            # (100)
            dict(type='flatten'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    if os.path.exists(os.path.join('models', version, 'checkpoint')):
        agent.restore_model(directory=os.path.join('models', version))

    agents = []
    for agent_id in range(3):
        # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))
        # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"])))
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))

    agent_id += 1
    agents.append(
        TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    wrapped_env = WrappedEnv(env, agent, visualize)
    runner = Runner(agent=agent, environment=wrapped_env)

    try:
        runner.run(episodes=episodes, max_episode_timesteps=100)
    finally:
        agent.save_model(directory=os.path.join('models', version, 'agent'))

    win_count = len(
        list(filter(lambda reward: reward == 1, runner.episode_rewards)))
    print('Stats: ')
    print(f'  runner.episode_rewards = {runner.episode_rewards}')
    print(f'  win count = {win_count}')

    runner.close()
Example no. 15
class PomFFA(gym.Env):
    agent_list = [HoldAgent(), HoldAgent(), HoldAgent(), HoldAgent()]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preprocess(obs)
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1.0 - self.total_reward
            else:
                return -0.5

        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -0.5

        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        blast = obs["bomb_blast_strength"]

        px = [1, -1, 0, 0]
        py = [0, 0, -1, 1]

        sum_reward = 0.0

        sum_reward += 20 * (len(obs["alive"]) - self.prev_alive)
        self.prev_alive = len(obs["alive"])

        if action == 0:
            sum_reward -= 0.1

        elif action == 5:
            # sum_reward += 1
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 2
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4
        else:
            assert (1 <= action <= 4), str(action)
            dx = x + px[action - 1]
            dy = y + py[action - 1]
            if (not (dx < 0 or dx > 10 or dy < 0
                     or dy > 10)) and obs["board"][dx][dy] == 0:
                if self.visited[dx][dy] > 0:
                    sum_reward -= 0.1
                else:
                    sum_reward += 0.3
                    self.visited[dx][dy] = 1

        sum_reward = sum_reward * 1.0 / 100.0
        new_total_reward = self.total_reward + sum_reward
        if new_total_reward > 0.8 or new_total_reward < -0.5:
            sum_reward = 0.0
        else:
            self.total_reward = new_total_reward

        return sum_reward
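    # Note on the shaping above: every per-step bonus/penalty is scaled by
    # 1/100 and accumulated in self.total_reward, and once the running total
    # would leave the (-0.5, 0.8) band, further shaping is zeroed out. The
    # win reward of 1.0 - self.total_reward then tops the winner's episode
    # return up to exactly 1.0.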

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id
                not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order].copy()

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))
        # board_size = 11

        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        danger = spaces.Box(low=0, high=20, shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=10, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        # return spaces.Dict({"board": board,
        #                     "bomb_blast_strength": bomb_blast_strength, "bomb_life": bomb_life,
        #                     "flame_life": flame_life,
        #                     "position": position, "ammo": ammo, "blast_strength": blast_strength})
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
            "danger": danger
        })

    @staticmethod
    def preprocess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])

        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        # flame_life = obs['flame_life']
        # position = obs['position']
        # ammo = obs['ammo']
        # blast_strength = obs['blast_strength']

        danger = np.empty(shape=(11, 11), dtype=int)

        for x in range(11):
            for y in range(11):
                danger[x][y] = 10
                if board[x][y] == 4:
                    board[x][y] = 0
                    danger[x][y] = 0
                elif board[x][y] == 3:
                    board[x][y] = 0
                elif board[x][y] == 10:
                    board[x][y] = 1
                elif board[x][y] > 10:
                    board[x][y] = 5
                elif 6 <= board[x][y] <= 8:
                    board[x][y] = 3
                elif board[x][y] == 1:
                    board[x][y] = 4

        for x in range(11):
            for y in range(11):
                if bomb_life[x][y] > 0:
                    strength = int(bomb_blast_strength[x][y] + 0.5)
                    for tx in range(max(0, x - strength + 1),
                                    min(11, x + strength)):
                        danger[tx][y] = min(danger[tx][y], bomb_life[x][y])
                    for ty in range(max(0, y - strength + 1),
                                    min(11, y + strength)):
                        danger[x][ty] = min(danger[x][ty], bomb_life[x][y])

        obs['danger'] = danger

        return obs

    def render(self):
        self.pomme.render()
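A tiny sketch of the danger-map idea used in preprocess above, on a 5x5 board: a bomb at (2, 2) with blast strength 2 and remaining life 3 lowers the danger value of every cell in its blast cross from the default 10 down to 3.

import numpy as np

danger = np.full((5, 5), 10)
x, y, strength, life = 2, 2, 2, 3
for tx in range(max(0, x - strength + 1), min(5, x + strength)):
    danger[tx][y] = min(danger[tx][y], life)
for ty in range(max(0, y - strength + 1), min(5, y + strength)):
    danger[x][ty] = min(danger[x][ty], life)
print(danger)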
Example no. 16
    def setup(self):
        agents = []
        if self.phase == 0:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 20
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 1:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 2:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0,
                                                       config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 3:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0,
                                                       config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 4:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 0
            config["env_kwargs"]["num_items"] = 10
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2,
                          SimpleAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        for agent_id in self.agents_index:
            agents.insert(
                agent_id,
                BaseLineAgent(config["agent"](agent_id, config["game_type"])))

        self.env.set_agents(agents)
        self.env.set_init_game_state(None)
        self.observation_space = spaces.Dict({
            "boards":
            spaces.Box(low=-1, high=20, shape=(3, 11, 11)),
            "states":
            spaces.Box(low=-1, high=20, shape=(9, )),
        })

        spaces.Box(low=-1.0, high=20.0, shape=(372, ), dtype=np.float32)
        self.action_space = self.env.action_space
Example no. 17
class MultiAgent(MultiAgentEnv):
    def __init__(self):
        super(MultiAgent, self).__init__()
        self.phase = 0
        self.setup()

    def setup(self):
        agents = []
        if self.phase == 0:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 20
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 1:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 2:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0,
                                                       config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 3:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0,
                                                       config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2,
                                                       config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        if self.phase == 4:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 0
            config["env_kwargs"]["num_items"] = 10
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(
                0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2,
                          SimpleAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()

        for agent_id in self.agents_index:
            agents.insert(
                agent_id,
                BaseLineAgent(config["agent"](agent_id, config["game_type"])))

        self.env.set_agents(agents)
        self.env.set_init_game_state(None)
        self.observation_space = spaces.Dict({
            "boards":
            spaces.Box(low=-1, high=20, shape=(3, 11, 11)),
            "states":
            spaces.Box(low=-1, high=20, shape=(9, )),
        })

        spaces.Box(low=-1.0, high=20.0, shape=(372, ), dtype=np.float32)
        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.phase = phase
        self.setup()
        self.reset()

    def step(self, actions):
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        for index in self.agents_index:
            try:
                action = actions[index]
            except KeyError:
                action = 0
            all_actions[index] = action

        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3],
            ]

        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        obs = self.env.reset()
        return {i: featurize(obs[i]) for i in self.agents_index}
Example no. 18
class Pomme_v0(MultiAgentEnv):
    '''
    A wrapped Pommerman v0 environment for usage with Ray RLlib. The v0 environment is the base environment used in
    the NIPS'18 competition. Contrary to v1 it doesn't collapse walls and also doesn't allow for radio communication
    between agents (as does v2).

    Agents are identified by (string) agent IDs: `AGENT_IDS`
    (Note that these "agents" here are not to be confused with RLlib agents.)
    '''
    def __init__(self, config=pommerman_cfg.team_competition_env()):
        '''
        Initializes the Pommerman environment and adds Dummy Agents as expected by `Pomme`.

        Args:
            config (dict): A config defining the game mode. Options include FFA mode, team (2v2) and team radio (2v2).
            See pommerman's config.py and docs for more details.
        '''
        self.pomme = Pomme(**config['env_kwargs'])
        self.observation_space = dict
        self.action_space = self.pomme.action_space
        self.agent_names = AGENT_IDS
        agent_list = []
        for i in range(4):
            agent_id = i
            agent_list.append(
                agents.BaseAgent(config["agent"](agent_id,
                                                 config["game_type"])))
        self.pomme.set_agents(agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """
        Resets the env and returns observations from ready agents.

        Returns:
            obs (dict): New observations for each ready agent.
        """
        obs_list = self.pomme.reset()
        #return {key: featurize(val) for key, val in to_dict(obs_list).items()}
        return {key: val for key, val in to_dict(obs_list).items()}

    def step(self, action_dict):
        """
        Returns observations from ready agents.
        The returns are dicts mapping from agent_id strings to values. The number of agents in the env can vary over
        time.

        Returns:
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the episode is just started, the value will be zero.
            dones (dict): Done values for each ready agent. The key "__all__" is used to indicate the end of the game.
            infos (dict): Info values for each ready agent.
        """
        # default actions since Pommerman env expects actions even if agent is dead
        actions = {'agent_0': 0, 'agent_1': 0, 'agent_2': 0, 'agent_3': 0}
        # update actions with the ones returned from the policies
        actions.update(action_dict)
        # perform env step (expects a list)
        obs, rewards, done, info = self.pomme.step(list(actions.values()))
        # build 'dones' dictionary, key __all__ indicates env termination
        dones = {'__all__': done}
        # fetch all
        done_agents = to_dict(
            [not agent.is_alive for agent in self.pomme._agents])
        # filter done dictionary to only return agents which are still alive
        # -> apparently this is how rllib determines when agents "die"
        dones.update({key: val for key, val in done_agents.items() if not val})
        # turn info dict into dictionary with agent IDs as keys
        infos = {
            AGENT_IDS[i]: {info_k: info_v
                           for info_k, info_v in info.items()}
            for i in range(NUM_PLAYERS)
        }
        return to_dict(obs), to_dict(rewards), dones, infos
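A hedged sketch of driving the wrapper outside RLlib, assuming the module-level AGENT_IDS and to_dict helpers from this file; every agent submits a random action, and dead agents simply fall back to the default no-op:

env = Pomme_v0()
obs = env.reset()
dones = {'__all__': False}
while not dones['__all__']:
    action_dict = {name: env.action_space.sample() for name in obs}
    obs, rewards, dones, infos = env.step(action_dict)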
Example no. 19
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent = json.load(fp=fp)

    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add 3 simple agents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add TensorforceAgent
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Instantiate and run the environment for 5 episodes.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)

    runner = Runner(agent=agent, environment=wrapped_env)

    rewards = []
    episodes = []
    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.episode_timestep,
                                                                             reward=r.episode_rewards[-1]))
        if r.episode % 1000 == 0:
            agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))
        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")
        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, Train, and Save Model
    if args.test or args.resume: # If test, change settings and restore model
        agent.restore_model('./','PPO_K_someS_500batch_biggerreward_99dis')
    runner.run(episodes=EPISODES, max_episode_timesteps=2000, episode_finished=episode_finished, deterministic=False)

    if not args.test:
        agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    #Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_len = len(prev_data[0])
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass
Example no. 20
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)

    config = ffa_v1_env()
    env = Pomme(**config["env_kwargs"])

    # Add 4 simple agents
    agents = {}
    for agent_id in range(4):
        agents[agent_id] = SimpleAgent(config["agent"](agent_id,
                                                       config["game_type"]))

    # agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")

    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)

    demo = []

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        demo.append(env.get_json_info())
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            demo.append(env.get_json_info())
        if 1 in reward:
            winner = reward.index(1)
        else:
            winner = None

        print('Episode {} finished'.format(i_episode))
    env.close()

    # If game not tied, save demonstration
    if winner is not None:
        demonstration = {'demo': demo, 'winner': winner}
        pickle.dump(demonstration, open("demonstration.p", "wb"))
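A small sketch for reading the saved demonstration back, e.g. for imitation learning:

import pickle

with open("demonstration.p", "rb") as f:
    demonstration = pickle.load(f)
print("winner:", demonstration["winner"])
print("recorded states:", len(demonstration["demo"]))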
Example no. 21
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.models import ModelCatalog

import pommerman
from pommerman import agents
from pommerman import configs
from pommerman import constants
from pommerman.envs.v0 import Pomme
from models.third_model import ActorCriticModel
from envs import v0

ray.init(num_cpus=5, num_gpus=1)

env_config = configs.phase_0_team_v0_env()
env = Pomme(**env_config['env_kwargs'])
act_space = env.action_space
ModelCatalog.register_custom_model("torch_conv", ActorCriticModel)
agent_names = ["ppo_agent_1", "ppo_agent_2"]

ppo_agent = PPOTrainer(config={
    "env_config": {
        "agent_names": agent_names,
        "env_id": "Mines-PommeTeam-v0",
        "phase": 0
    },
    "num_workers": 0,
    "num_gpus": 0,
    "multiagent": {
        "policies": {
            "ppo_policy": (PPOTFPolicy, obs_space, act_space, {
Example no. 22
import os
import sys
import numpy as np

from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v1_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility

config = ffa_v1_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)
Example no. 23
                              kick_map, skull_map, bomb_blast_strength,
                              bomb_life], axis=2).flatten()
        obs = np.append(obs, [ammo, blast_strength, can_kick])
        return obs

    def act(self, obs, action_space=None):
        obs = self.featurize(obs)
        obs = np.array([obs])
        predictions = self.model.predict(obs)
        print(np.argmax(predictions))
        print(predictions)
        return np.argmax(predictions)

# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)

agent_pos = 2
# Add 3 random agents
agents = []
for agent_id in range(4):
    if agent_id == agent_pos:
        # agents.append(Cnn12833Dense1281(env.action_space.n, BOARD_SIZE, character=config["agent"](agent_id, config["game_type"]),
        #                       save_path=model_path))
        # agents.append(Dense82(env.action_space.n, BOARD_SIZE, character=config["agent"](agent_id, config["game_type"]),
        #                       save_path=model_path2))
        # agents.append(Dense128(env.action_space.n, BOARD_SIZE, character=config["agent"](agent_id, config["game_type"]),
        #                       save_path=model_path3))
        # agents.append(Dense128(env.action_space.n, BOARD_SIZE, character=config["agent"](agent_id, config["game_type"]),
        #                        save_path='./dqn/model/ddgp_dense_128_1_rs/model.h4'))
Example no. 24
                all_actions = self.env.act(obs)
                obs, reward, done, _ = self.env.step(all_actions)
                episode_steps += 1

                observations.append(obs)
                actions.append(all_actions)
                rewards.append(reward)
                dones.append(done)

            print('rollout %i/%i' % (i + 1, num_rollouts))
        return np.array(observations), np.array(
            to_categorical(
                actions,
                self.env.action_space.n)), np.array(rewards), np.array(dones)


# Instantiate the environment
config = ffa_competition_env()
env = Pomme(**config["env_kwargs"])

# Generate training data
stimulator = Stimulator(env, config)
observations, actions, rewards, dones = stimulator.stimulate(
    num_rollouts=initial_rollouts)

np.save(train_data_path + train_data_obs, observations)
np.save(train_data_path + train_data_labels, actions)
np.save(train_data_path + train_data_reward, rewards)
np.save(train_data_path + train_data_done, dones)

#print(np.sum(training_data_labels, axis=0) / np.sum(training_data_labels))
Example no. 25
class PomFFA(gym.Env):

    def __init__(self, env_config=None):

        self.agent_list = [HoldAgent(), agents.SimpleAgent(), HoldAgent(), HoldAgent()]
        # self.agent_list = [agents.SimpleAgent(), agents.SimpleAgent(), agents.SimpleAgent(), agents.RandomAgent()]
        self.all_obs = None
        self.all_action = None
        self.cur_obs = None
        self.alive_agents = [10, 11, 12, 13]
        self.player_agent_id = 10
        self.total_reward = 0

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs.copy()
        obs = self.preprocess(obs)
        self.total_reward = 0
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 0.5
            else:
                return -0.5

        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -0.5

        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -0.5

        x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]

        px = [0, 1, 0, -1]
        py = [1, 0, -1, 0]

        sum_reward = 0
        if action == 5:
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 1
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4

        sum_reward = sum_reward * 1.0 / 200.0
        new_total_reward = self.total_reward + sum_reward
        if new_total_reward > 0.5 or new_total_reward < -0.5:
            sum_reward = 0
        else:
            self.total_reward = new_total_reward

        return sum_reward

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        # print(obs)
        del self.all_obs
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        del self.cur_obs
        self.cur_obs = obs.copy()
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']

        if self.player_agent_id not in self.alive_agents or self.cur_obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)

        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        # print("env config: {}".format(env_config))
        # board_size = 11

        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))
        return spaces.Dict({"board": board, "bomb_blast_strength": bomb_blast_strength, "bomb_life": bomb_life,
                            "flame_life": flame_life,
                            "position": position, "ammo": ammo, "blast_strength": blast_strength})

    @staticmethod
    def preprocess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
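A hedged usage sketch: since the wrapper above is a plain gym.Env, an episode can be rolled out with random actions.

env = PomFFA()
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())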
Example no. 26
class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config=None):
        env_config = env_config or {}
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)
        self.init_state()

    def init_state(self):
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)
        self.init_state()

        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']

        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)

        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)

        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']

        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        flame_life = obs['flame_life']
        agent_id = state['agent_id']
        ammo = state['ammo']
        passage = np.zeros_like(board)
        wall = np.zeros_like(board)
        wood = np.zeros_like(board)
        bomb = np.zeros_like(board)
        bonus = np.zeros_like(board)
        me = np.zeros_like(board)
        enemy = np.zeros_like(board)
        for y in range(board.shape[0]):
            for x in range(board.shape[1]):
                v = board[y][x]
                if v == 0:
                    passage[y][x] = 1
                elif v == 1:
                    wall[y][x] = 1
                elif v == 2:
                    wood[y][x] = 1
                elif v == 3:
                    bomb = create_cross(bomb, (y, x),
                                        bomb_blast_strength[y][x])
                elif v == 4:
                    pass
                elif v == 6 or v == 7:
                    bonus[y][x] = 1
                elif v >= 10:
                    if v == agent_id:
                        me[y][x] = 1
                    else:
                        enemy[y][x] = 1
                    if bomb_blast_strength[y][x] > 0:
                        bomb = create_cross(bomb, (y, x),
                                            bomb_blast_strength[y][x])

        ammo = ammo * np.ones_like(board) / 12
        bomb_life /= 9
        flame_life /= 3
        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        board = spaces.Box(
            low=0, high=1,
            shape=(board_size, board_size,
                   10))  # passage,wall,wood,bomb,bonus,me,enemies
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        ammo = spaces.Box(low=0,
                          high=num_items,
                          shape=(board_size, board_size))
        # return spaces.Dict({"board": board, "bomb_life": bomb_life, "flame_life": flame_life,"ammo": ammo})
        return board

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    def render(self):
        self.pomme.render()
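A hedged note on the wrapper above (assuming the default 11x11 FFA board): build_obs stacks ten planes, so observations come back as an (11, 11, 10) array whose channels are passage, wall, wood, bomb (with blast crosses), bonus, me, enemy, bomb_life, flame_life and a constant ammo plane.

env = PomFFA()
obs = env.reset()
assert obs.shape == (11, 11, 10)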
Example no. 27
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, actions)
        state, reward, terminal, _ = self.gym.step(all_actions)
        agent_state = featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, terminal, agent_reward

    def reset(self):
        obs = self.gym.reset()
        agent_obs = featurize(obs[3])
        return agent_obs


# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])
env.seed(0)

# Create a Proximal Policy Optimization agent
agent = PPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_values=env.action_space.n),
    network='auto',
    max_episode_timesteps=600,
    batch_size=100,
    learning_rate=1e-3,
    summarizer=dict(
        directory="./board5/",
        #steps=50,
        summaries='all'))
Example no. 28
    def setup(self):
        agents = []
        if self.phase == 0:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            init_state['board'] = json.dumps(board.tolist())
            agents_json = json.loads(copy.copy(init_state['agents']))
            random_pos = np.random.choice(board.shape[0], (2, 2),
                                          replace=False)
            agents_json[0]["position"] = random_pos[0].tolist()
            agents_json[1]["position"] = random_pos[1].tolist()
            init_state['agents'] = json.dumps(agents_json)
            self.env._init_game_state = init_state
            self.env.reset()

        if self.phase == 1:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            board[5, :] = (np.ones(11) * 2)
            agents_json = json.loads(copy.copy(init_state['agents']))
            agents_json[0]["position"] = [
                random.randint(0, 4),
                random.randint(0, 10)
            ]
            agents_json[1]["position"] = [
                random.randint(6, 10),
                random.randint(0, 10)
            ]
            init_state['agents'] = json.dumps(agents_json)
            init_state['board'] = json.dumps(board.tolist())
            self.env._init_game_state = init_state
            self.env.reset()

        self.observation_space = spaces.Dict({
            'boards':
            spaces.Box(low=-1, high=25, shape=(11, 11, 18), dtype=np.float32),
            'states':
            spaces.Box(low=-1, high=25, shape=(8, ), dtype=np.float32)
        })

        self.action_space = self.env.action_space
Example no. 29
def featurize(obs):
    # Flatten the board and bomb maps, then append the scalar features.
    board = obs["board"].reshape(-1).astype(np.float32)
    bomb_blast_strength = obs["bomb_blast_strength"].reshape(-1).astype(np.float32)
    bomb_life = obs["bomb_life"].reshape(-1).astype(np.float32)
    position = make_np_float(obs["position"])
    ammo = make_np_float([obs["ammo"]])
    blast_strength = make_np_float([obs["blast_strength"]])
    can_kick = make_np_float([obs["can_kick"]])

    teammate = obs["teammate"]
    if teammate is not None:
        teammate = teammate.value
    else:
        teammate = -1
    teammate = make_np_float([teammate])

    enemies = obs["enemies"]
    enemies = [e.value for e in enemies]
    if len(enemies) < 3:
        enemies = enemies + [-1]*(3 - len(enemies))
    enemies = make_np_float(enemies)

    return np.concatenate((board, bomb_blast_strength, bomb_life, position, ammo, blast_strength, can_kick, teammate, enemies))
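
For an 11x11 board this flattened vector has 3 x 121 map cells plus 2 position entries, 3 scalars (ammo, blast strength, can_kick), 1 teammate id and 3 enemy ids, i.e. 372 features in total.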

# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])
# Add 3 random agents
agents = {}
for agent_id in range(3):
    agents[agent_id] = StaticAgent(config["agent"](agent_id, config["game_type"]))

# Add human agent

agent_id += 1
agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")

env.set_agents(list(agents.values()))
env.set_init_game_state(None)
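
A short sketch of playing one episode with this setup; the render/close calls are illustrative, and the loop uses the standard Pomme act/step API.

# Sketch: run a single episode with the agents registered above.
obs = env.reset()
done = False
while not done:
    env.render()                      # needed so the human player can see the board
    actions = env.act(obs)            # every registered agent picks an action
    obs, rewards, done, info = env.step(actions)
env.close()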

Example no. 30
class MultiAgend(MultiAgentEnv):
    def __init__(self):
        super(MultiAgend, self).__init__()
        self.phase = 0
        self.next_phase = 0
        self.steps = 0
        self.last_featurize_obs = None
        self.setup()

    def featurize(self, obs):
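        # Decompose the raw board into binary planes (enemies, self, wood,
        # stone, items, flames, teammate) plus bomb maps, then stack the two
        # most recent frames so the policy can infer movement.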

        enemies = []
        for agent_id in self.enemies_agents_index:
            if agent_id == 0:
                enemies.append(Item.Agent0)
            if agent_id == 1:
                enemies.append(Item.Agent1)
            if agent_id == 2:
                enemies.append(Item.Agent2)
            if agent_id == 3:
                enemies.append(Item.Agent3)

        for enemie in obs["enemies"]:
            if enemie not in enemies:
                obs["board"] = ma.masked_equal(
                    obs["board"], enemie.value).filled(fill_value=0)

        board = np.copy(obs["board"])
        board[obs["position"][0], obs["position"][1]] = 0.0
        enemie_pos = np.full((11, 11), 0)
        for enemie in obs["enemies"]:
            enemie_pos = enemie_pos | ma.masked_not_equal(
                board, enemie.value).filled(fill_value=0)
            board = ma.masked_equal(board, enemie.value).filled(fill_value=0)

        wood = ma.masked_not_equal(board, 2).filled(fill_value=0)
        wood = (wood > 0).astype(np.float32)
        board = ma.masked_equal(board, 2).filled(fill_value=0)

        stone = ma.masked_not_equal(board, 1).filled(fill_value=0)
        stone = (stone > 0).astype(np.float32)
        board = ma.masked_equal(board, 1).filled(fill_value=0)
        enemie_pos = (enemie_pos > 0).astype(np.float32)

        board = ma.masked_equal(board,
                                obs["teammate"].value).filled(fill_value=0)

        flames = ma.masked_not_equal(board, 4).filled(fill_value=0)
        flames = (flames > 0).astype(np.float32)

        board = ma.masked_equal(board, 4).filled(fill_value=0)
        board = ma.masked_equal(board, 3).filled(fill_value=0)

        teammate_pos = ma.masked_not_equal(
            board, obs["teammate"].value).filled(fill_value=0)
        teammate_pos = (teammate_pos > 0).astype(np.float32)
        board = ma.masked_equal(board,
                                obs["teammate"].value).filled(fill_value=0)
        items = board.astype(np.float32)

        pos = np.full((11, 11), 0)
        pos[obs["position"][0], obs["position"][1]] = 1.0
        pos = pos.astype(np.float32)

        bomb_life = obs["bomb_life"].astype(np.float32)
        bomb_blast_strength = obs["bomb_blast_strength"].astype(np.float32)

        ammo = utility.make_np_float([obs["ammo"]])
        blast_strength = utility.make_np_float([obs["blast_strength"]])
        can_kick = utility.make_np_float([obs["can_kick"]])
        game_end = utility.make_np_float([
            (self.max_steps - self.steps) / self.max_steps
        ])

        actual_featurize_obs = {
            'boards':
            np.stack([
                enemie_pos, pos, wood, stone, items, flames, teammate_pos,
                bomb_life, bomb_blast_strength
            ],
                     axis=0),
            'states':
            np.concatenate([ammo, blast_strength, can_kick, game_end]),
        }

        if self.last_featurize_obs is None:
            featurize_obs = {
                'boards':
                np.concatenate([
                    actual_featurize_obs['boards'],
                    actual_featurize_obs['boards']
                ],
                               axis=0),
                'states':
                np.concatenate([
                    actual_featurize_obs['states'],
                    actual_featurize_obs['states']
                ]),
            }
        else:
            featurize_obs = {
                'boards':
                np.concatenate([
                    self.last_featurize_obs['boards'],
                    actual_featurize_obs['boards']
                ],
                               axis=0),
                'states':
                np.concatenate([
                    self.last_featurize_obs['states'],
                    actual_featurize_obs['states']
                ]),
            }

        self.last_featurize_obs = actual_featurize_obs
        return featurize_obs

    def setup(self):
        agents = []
        if self.phase == 0:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            init_state['board'] = json.dumps(board.tolist())
            agents_json = json.loads(copy.copy(init_state['agents']))
            random_pos = np.random.choice(board.shape[0], (2, 2),
                                          replace=False)
            agents_json[0]["position"] = random_pos[0].tolist()
            agents_json[1]["position"] = random_pos[1].tolist()
            init_state['agents'] = json.dumps(agents_json)
            self.env._init_game_state = init_state
            self.env.reset()

        if self.phase == 1:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index,
                                              config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents':
                '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            board[5, :] = (np.ones(11) * 2)
            agents_json = json.loads(copy.copy(init_state['agents']))
            agents_json[0]["position"] = [
                random.randint(0, 4),
                random.randint(0, 10)
            ]
            agents_json[1]["position"] = [
                random.randint(6, 10),
                random.randint(0, 10)
            ]
            init_state['agents'] = json.dumps(agents_json)
            init_state['board'] = json.dumps(board.tolist())
            self.env._init_game_state = init_state
            self.env.reset()

        # featurize() stacks 9 planes per frame and concatenates two frames
        # along axis 0, so 'boards' must be (18, 11, 11), not (11, 11, 18).
        self.observation_space = spaces.Dict({
            'boards':
            spaces.Box(low=-1, high=25, shape=(18, 11, 11), dtype=np.float32),
            'states':
            spaces.Box(low=-1, high=25, shape=(8, ), dtype=np.float32)
        })

        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.next_phase = phase

    def close(self):
        self.env.close()

    def step(self, actions):
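        # Let the scripted agents act first; the learner's actions then
        # overwrite the slots of the trained agents below.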
        self.steps = self.steps + 1
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        assert (len(all_actions) == len(self.agents_index) +
                len(self.enemies_agents_index))

        for index in self.agents_index:
            try:
                action = actions[index]
            except KeyError:
                # Default to Stop when the policy supplied no action.
                action = 0
            assert all_actions[index] is None
            all_actions[index] = action

        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                self.featurize(step_obs[0][i]), step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2], step_obs[3]
            ]

        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        self.steps = 0
        self.phase = self.next_phase
        self.setup()
        obs = self.env.get_observations()
        return {i: self.featurize(obs[i]) for i in self.agents_index}
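
Since MultiAgend subclasses RLlib's MultiAgentEnv, it can be registered with Ray's env registry. A minimal sketch, assuming ray is installed; the registry name "pommerman_ma" and the smoke-test asserts are illustrative, with the shapes following from featurize() above.

# Sketch: register the env with RLlib and smoke-test the featurized shapes.
from ray.tune.registry import register_env

register_env("pommerman_ma", lambda env_config: MultiAgend())

env = MultiAgend()
first_obs = env.reset()
for agent_obs in first_obs.values():
    assert agent_obs['boards'].shape == (18, 11, 11)   # 9 planes x 2 frames
    assert agent_obs['states'].shape == (8,)           # 4 scalars x 2 frames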