Example #1
    def log_returns(self,
                    episode_id: str,
                    reward_dict: MultiAgentDict,
                    info_dict: MultiAgentDict = None,
                    multiagent_done_dict: MultiAgentDict = None) -> None:
        """Record returns from the environment.

        The reward will be attributed to the previous action taken by the
        episode. Rewards accumulate until the next action. If no reward is
        logged before the next action, a reward of 0.0 is assumed.

        Args:
            episode_id: Episode id returned from start_episode().
            reward_dict: Reward from the environment agents.
            info_dict: Optional info dict.
            multiagent_done_dict: Optional done dict for agents.
        """

        episode = self._get(episode_id)

        # Accumulate reward by agent.
        # For existing agents, we want to add the reward up.
        for agent, rew in reward_dict.items():
            if agent in episode.cur_reward_dict:
                episode.cur_reward_dict[agent] += rew
            else:
                episode.cur_reward_dict[agent] = rew

        if multiagent_done_dict:
            for agent, done in multiagent_done_dict.items():
                episode.cur_done_dict[agent] = done

        if info_dict:
            episode.cur_info_dict = info_dict or {}
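
For orientation, this log_returns() signature matches RLlib's ExternalMultiAgentEnv API, where it is normally driven from the env's run() loop between get_action() and end_episode(). The sketch below is a minimal, illustrative usage only; the real_env handle and its reset()/step() calls are assumptions, not part of the example above.

from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv


class MyExternalMultiAgentEnv(ExternalMultiAgentEnv):
    """Hedged usage sketch; `self.real_env` is a hypothetical simulator."""

    def run(self):
        episode_id = self.start_episode()
        obs_dict = self.real_env.reset()  # hypothetical external simulator
        while True:
            action_dict = self.get_action(episode_id, obs_dict)
            obs_dict, reward_dict, done_dict, info_dict = self.real_env.step(
                action_dict)
            # Rewards accumulate against the preceding get_action() call.
            self.log_returns(episode_id, reward_dict, info_dict=info_dict,
                             multiagent_done_dict=done_dict)
            if done_dict["__all__"]:
                self.end_episode(episode_id, obs_dict)
                episode_id = self.start_episode()
                obs_dict = self.real_env.reset()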
Example #2
    def observe(self, obs: MultiAgentDict, rewards: MultiAgentDict,
                dones: MultiAgentDict, infos: MultiAgentDict):
        self.last_obs = obs
        # Accumulate rewards per agent until they are consumed.
        for ag, r in rewards.items():
            if ag in self.last_rewards:
                self.last_rewards[ag] += r
            else:
                self.last_rewards[ag] = r
        # Once an agent is flagged done, it stays done.
        for ag, d in dones.items():
            if ag in self.last_dones:
                self.last_dones[ag] = self.last_dones[ag] or d
            else:
                self.last_dones[ag] = d
        self.last_infos = infos
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[dict, dict, Dict[Union[str, Any], Union[bool, Any]], dict]:
        hunters = []
        n = 0
        #print(len(self.agents))
        observation, reward, done, reproduce = {}, {}, {}, {}
        alive = []
        #print(len(action_dict), action_dict)
        for i, action in action_dict.items():
            if i not in self.dones:
                observation[i], reward[i], done[i], reproduce[i] = self.agents[
                    i].step(action)
                if done[i]:
                    self.dones.append(i)
                alive.append(i)

        for i in alive:
            # print("len", observation, action_dict[0], reward)
            if i not in self.dones:
                if reproduce[i]:
                    new_hunter = HunterEnv()
                    observation[len(self.agents)] = new_hunter.reset()
                    reward[len(self.agents)] = 0
                    done[len(self.agents)] = False
                    reproduce[len(self.agents)] = False
                    self.agents.append(new_hunter)
        done["__all__"] = len(self.dones) == len(self.agents)
        #print(observation)
        self.alive = len(observation)
        return observation, reward, done, reproduce
Example #4
    def observation_space_contains(self, x: MultiAgentDict) -> bool:
        """Checks if the observation space contains the given key.

        Args:
            x: Observations to check.

        Returns:
            True if the observation space contains all observations in x.
        """
        if (not hasattr(self, "_spaces_in_preferred_format")
                or self._spaces_in_preferred_format is None):
            self._spaces_in_preferred_format = (
                self._check_if_space_maps_agent_id_to_sub_space())
        if self._spaces_in_preferred_format:
            for key, agent_obs in x.items():
                if not self.observation_space[key].contains(agent_obs):
                    return False
            if not all(k in self.observation_space for k in x):
                if log_once(
                        "possibly_bad_multi_agent_dict_missing_agent_observations"
                ):
                    logger.warning(
                        "Your environment returns observations that are "
                        "MultiAgentDicts with incomplete information, i.e. "
                        "they only contain information on a subset of the "
                        "participating agents. Ignore this warning if this "
                        "is intended, for example if your environment is a "
                        "turn-based simulation.")
            return True

        logger.warning("observation_space_contains() has not been implemented")
        return True
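
To illustrate the "preferred format" branch above, here is a hedged sketch of the per-key containment check it performs; the Dict space layout, agent IDs, and sample values are illustrative assumptions, not taken from the example.

import numpy as np
from gym.spaces import Box, Dict

# Illustrative observation space mapping agent IDs to per-agent sub-spaces.
obs_space = Dict({
    "agent_0": Box(-1.0, 1.0, (4,), np.float32),
    "agent_1": Box(-1.0, 1.0, (4,), np.float32),
})

# A partial MultiAgentDict: only agent_0 reports an observation this step,
# which still passes the per-key check (and triggers the warning above).
sample = {"agent_0": np.zeros(4, dtype=np.float32)}

# Mirrors the per-key containment check in the preferred-format branch.
assert all(obs_space[key].contains(agent_obs)
           for key, agent_obs in sample.items())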
Example #5
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:

        processed_action_dict = {}
        for agent_id, action in action_dict.items():
            if agent_id in self.agents_to_action_converters:
                convertor: RestrictedToBaseGameActionSpaceConverter = self.agents_to_action_converters[
                    agent_id]
                base_game_action, _, _ = convertor.get_base_game_action(
                    obs=self._agents_to_current_obs[agent_id],
                    restricted_game_action=action,
                    use_delegate_policy_exploration=self._use_delegate_policy_exploration,
                    clip_base_game_actions=self._clip_base_game_actions,
                    delegate_policy_state=None)
                processed_action_dict[agent_id] = base_game_action
            else:
                processed_action_dict[agent_id] = action

        obs, rews, dones, infos = self.base_env.step(
            action_dict=processed_action_dict)

        for agent_id, observation in obs.items():
            self._agents_to_current_obs[agent_id] = observation

        return obs, rews, dones, infos
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        obs = {}
        rewards = {}
        dones = {}
        infos = {}

        self.current_step += 1

        # Determine which agent gets to act first
        random.shuffle(self.active_agents)

        for agent in self.active_agents:
            self.perform_agent_action(agent, self.available_actions[action_dict[agent.name]])

        # Update arena after handling actions from agents
        self.collect_coins()
        self.update_bombs()
        self.evaluate_explosions()

        # Set obs, reward, done, info for agents still alive
        # Agents that died during this step will get their next obs, reward, done, info later when the round finishes
        for agent in self.active_agents:
            rewards[agent.name] = agent.aux_score
            rewards[agent.name] -= np.average([v.aux_score for k, v in self.agents.items() if k != agent.name])
            agent.store_game_state(self.get_state_for_agent(agent))
            dones[agent.name] = False
            obs[agent.name] = get_kill_observation_from_game_state(agent.last_game_state)
            infos[agent.name] = agent.score

        for agent_name in action_dict.keys():
            if agent_name not in map(lambda a: a.name, self.active_agents):
                agent = self.agents[agent_name]
                agent.store_game_state(self.get_state_for_agent(agent))
                self.agents_last_obs[agent_name] = get_kill_observation_from_game_state(agent.last_game_state)

        for agent in self.agents.values():
            if agent.dead:
                agent.penalty += agent.aux_score
                agent.penalty -= np.average([v.aux_score for k, v in self.agents.items() if k != agent.name])

        for agent in self.agents.values():
            agent.aux_score = 0

        if self.done():
            self.end_round()
            dones['__all__'] = True
            for a in self.agents.values():
                # Add observation for agents that died earlier
                if a not in self.active_agents:
                    rewards[a.name] = a.penalty
                    obs[a.name] = self.agents_last_obs[a.name]
                dones[a.name] = True
                infos[a.name] = a.score
        else:
            dones['__all__'] = False
        return obs, rewards, dones, infos
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        obs = {}
        rewards = {}
        dones = {}
        infos = {}
        for agent_name, action in action_dict.items():
            if agent_name.endswith('_high'):
                agent = self.flat_env.agents[agent_name]
                action_name = HIGH_LEVEL_ACTIONS[action]
                agent.high_level_steps += 1
                agent_id = f'{agent.low_level_prefix}{action_name}_{agent_name}_{agent.high_level_steps}'
                if action_name == 'COLLECT':
                    obs.update({agent_id : get_collect_observation_from_game_state(agent.last_game_state)})
                elif action_name == 'DESTROY':
                    obs.update({agent_id : get_destroy_observation_from_game_state(agent.last_game_state)})
                elif action_name == 'KILL':
                    obs.update({agent_id : get_kill_observation_from_game_state(agent.last_game_state)})
                else:
                    raise Exception()
                rewards.update({agent_id: 0})
                dones.update({agent_id: False })
                self.high_low_mapping[agent_name] = agent_id
                agent.current_mode = action_name
                agent.current_sub_id = agent_id
                #print(f'Agent {agent_name} now {action_name}')
            else:
                #agent_1_high
                #agent_1_low_1
                #agent_2_low_5
                agent_parts = agent_name.split('_')
                high_level_agent_name = f'{agent_parts[2]}_{agent_parts[3]}_high'
                self.action_buffer[high_level_agent_name] = action
                #print(f'Add to buffer: Agent {high_level_agent_name} - Action {action}')

                #agent = self.flat_env.agents[high_level_agent_name]

        if len(self.action_buffer) == len(self.flat_env.active_agents):
            obs, rewards, dones, infos = self.flat_env.step(self.action_buffer)

            self.action_buffer = {}
            pass
        else:
            dones.update({'__all__' : False})

        return obs, rewards, dones, infos
Example #8
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:

        for agent_id, action in action_dict.items():
            if self._agents_to_current_valid_actions_mask[
                    agent_id] is not None:
                assert self._agents_to_current_valid_actions_mask[agent_id][action] == 1.0, \
                    f"\nagent is {agent_id}, " \
                    f"action is {action}, " \
                    f"restricted valid_actions are {self._agents_to_current_valid_actions_mask[agent_id]}"

        base_obs_dict, rews, dones, infos = self.base_env.step(
            action_dict=action_dict)

        restricted_game_obs = self._convert_obs_to_restricted_game(
            base_game_obs_dict=base_obs_dict, dones=dones)

        return restricted_game_obs, rews, dones, infos
Example #9
    def _convert_obs_to_restricted_game(self,
                                        base_game_obs_dict: MultiAgentDict,
                                        dones):
        obs_dict_out = {}

        self._agents_to_current_valid_actions_mask = {
            agent: None
            for agent in range(2)
        }

        for agent_id, base_game_obs in base_game_obs_dict.items():
            if agent_id in self.agent_conversions:
                if not dones["__all__"]:
                    base_game_obs_as_tuple = tuple(base_game_obs)
                    try:
                        restricted_game_obs = self.agent_conversions[
                            agent_id].orig_obs_to_restricted_game_obs[
                                base_game_obs_as_tuple]
                        # assert len(restricted_game_obs) == 90, "only needs to be true for 20x dummy leduc"
                    except KeyError:
                        assert isinstance(base_game_obs_as_tuple, tuple)
                        assert base_game_obs_as_tuple[0] == \
                               list(self.agent_conversions[agent_id].orig_obs_to_restricted_game_obs.keys())[0][
                                   0], f"key provided is {base_game_obs_as_tuple}\n agent id is {agent_id} \n example key is {list(self.agent_conversions[agent_id].orig_obs_to_restricted_game_obs.keys())[0]}"
                        assert len(base_game_obs_as_tuple) == len(
                            list(self.agent_conversions[agent_id].
                                 orig_obs_to_restricted_game_obs.keys())[0]
                        ), f"{len(base_game_obs_as_tuple)} {len(list(self.agent_conversions[agent_id].orig_obs_to_restricted_game_obs.keys())[0])}"
                        print(
                            f"keys are: {self.agent_conversions[agent_id].orig_obs_to_restricted_game_obs.keys()}\n\nlooking for {base_game_obs_as_tuple}"
                        )
                        raise
                    self._agents_to_current_valid_actions_mask[agent_id] = \
                    self.agent_conversions[agent_id].orig_obs_to_restricted_game_valid_actions_mask[
                        base_game_obs_as_tuple]
                    obs_dict_out[agent_id] = restricted_game_obs
                else:
                    restricted_game_obs = np.zeros(
                        shape=self.observation_space.shape, dtype=np.float32)
                    restricted_game_obs[:len(base_game_obs)] = base_game_obs
                    obs_dict_out[agent_id] = restricted_game_obs
            else:
                obs_dict_out[agent_id] = base_game_obs
        return obs_dict_out
Example #10
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        #print(len(self.agents))
        observation, reward, done, reproduce = {}, {}, {}, {}
        alive = []
        #print(len(action_dict), action_dict)
        for id, action in action_dict.items():
            #i = int(id.split('_')[1])

            if id not in self.dones:
                observation[id], reward[id], done[id], reproduce[
                    id] = self.agents[id].step(action)
                if done[id]:
                    self.dones.append(id)
                alive.append(id)
                # else:
                #     observation[id], reward[id], done[id], reproduce[id] = self.prey_agents[id].step(action)
                #     if done[id]:
                #         self.dones.append(id)
                #     alive.append(id)

        for id in alive:
            # print("len", observation, action_dict[0], reward)
            if id not in self.dones:
                if reproduce[id]:
                    if "hunter" in id:
                        self.hunter_count += 1
                        new_agent = HunterEnv()
                        new_id = "hunter_" + str(self.hunter_count)
                    else:
                        self.prey_count += 1
                        new_agent = PreyEnv()
                        new_id = "prey_" + str(self.prey_count)

                    observation[new_id] = new_agent.reset()
                    reward[new_id] = 0
                    done[new_id] = False
                    reproduce[new_id] = False
                    self.agents[new_id] = new_agent
        done["__all__"] = len(self.dones) == len(self.agents)
        #print(observation)
        self.alive = len(observation)
        return observation, reward, done, reproduce
Example #11
    def observation_space_contains(self, x: MultiAgentDict) -> bool:
        if not isinstance(x, dict):
            return False
        return all(
            self.observation_space.contains(val) for val in x.values())
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        obs = {}
        rewards = {}
        dones = {}
        infos = {}

        self.current_step += 1

        # Determine which agent gets to act first
        random.shuffle(self.active_agents)

        for agent in self.active_agents:
            self.perform_agent_action(
                agent, self.available_actions[action_dict[agent.name]])

        # Update arena after handling actions from agents
        self.collect_coins()
        self.update_bombs()
        self.evaluate_explosions()

        # Set obs, reward, done, info for agents still alive
        # Agents that died during this step will get their next obs, reward, done, info later when the round finishes
        for agent in self.active_agents:
            rewards[agent.name] = agent.aux_score
            rewards[agent.name] -= np.average([
                v.aux_score for k, v in self.agents.items() if k != agent.name
            ])
            #agent.crates_destroyed = 0# Reset aux score
            agent.store_game_state(self.get_state_for_agent(agent))
            dones[agent.name] = False
            obs[agent.name] = get_observation_from_game_state(
                agent.last_game_state, self.agents.keys())
            infos[agent.name] = agent.score

        for agent_name in action_dict.keys():
            if agent_name not in map(lambda a: a.name, self.active_agents):
                agent = self.agents[agent_name]
                agent.store_game_state(self.get_state_for_agent(agent))
                self.agents_last_obs[
                    agent_name] = get_observation_from_game_state(
                        agent.last_game_state, self.agents.keys())

        for agent in self.agents.values():
            if agent.dead:
                agent.penalty += agent.aux_score
                agent.penalty -= np.average([
                    v.aux_score for k, v in self.agents.items()
                    if k != agent.name
                ])

        for agent in self.agents.values():
            agent.aux_score = 0

        if self.done():
            self.end_round()
            dones['__all__'] = True
            # Determine winner and losers
            # winner can only contain a single agent with the highest score
            # loser contains agents without the highest score
            winner, loser = self.get_winner_loser()
            for a in self.agents.values():
                #rewards[a.name] = 0
                # Add observation for agents that died earlier
                if a not in self.active_agents:
                    rewards[a.name] = a.penalty
                    #a.store_game_state(self.get_state_for_agent(a))
                    #obs[a.name] = get_observation_from_game_state(a.last_game_state, self.agents.keys())
                    obs[a.name] = self.agents_last_obs[a.name]
                # Add rewards for all agents based on their final score
                #if a.name in winner:
                #rewards[a.name] = 3. / 3**(len(winner)-1)
                #rewards[a.name] -= np.average([v.score + v.crates_destroyed * 0.05 for k, v in self.agents.items() if k != a.name])
                #elif a.name in loser:
                #    rewards[a.name] = -1.
                #else:
                #    rewards[a.name] = 0.
                dones[a.name] = True
                infos[a.name] = a.score
        else:
            dones['__all__'] = False
        return obs, rewards, dones, infos
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        obs = {}
        rewards = {}
        dones = {}
        infos = {}

        self.current_step += 1

        # Determine which agent gets to act first
        random.shuffle(self.active_agents)

        for agent in self.active_agents:
            #print(f'Agent {agent.name} {agent.current_sub_id} - Action {LOW_LEVEL_ACTIONS[action_dict[agent.name]]}')
            self.perform_agent_action(
                agent, LOW_LEVEL_ACTIONS[action_dict[agent.name]])

        # Update arena after handling actions from agents
        self.collect_coins()
        self.update_bombs()
        self.evaluate_explosions()

        # Set obs, reward, done, info for agents still alive
        # Agents that died during this step will get their next obs, reward, done, info later when the round finishes
        for agent in self.active_agents:
            agent.store_game_state(self.get_state_for_agent(agent))
            agent.low_level_steps += 1
            if agent.current_mode == "COLLECT":
                obs[agent.
                    current_sub_id] = get_collect_observation_from_game_state(
                        agent.last_game_state)
                if agent.high_level_aux_score == 1:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.crates_destroyed = 0
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    rewards[agent.current_sub_id] = -0.01
                    dones[agent.current_sub_id] = False
            elif agent.current_mode == "KILL":
                obs[agent.
                    current_sub_id] = get_kill_observation_from_game_state(
                        agent.last_game_state)
                if agent.high_level_aux_score >= 5:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    rewards[
                        agent.current_sub_id] = agent.crates_destroyed * 0.01
                    rewards[agent.current_sub_id] -= 0.01
                    dones[agent.current_sub_id] = False
            elif agent.current_mode == "DESTROY":
                obs[agent.
                    current_sub_id] = get_destroy_observation_from_game_state(
                        agent.last_game_state)
                if agent.crates_destroyed > 0:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    rewards[agent.current_sub_id] = -0.01
                    dones[agent.current_sub_id] = False
            if dones[agent.current_sub_id]:
                #print(f'Agent {agent.name} {agent.current_sub_id} finished {agent.current_mode}')
                obs[agent.name] = get_high_level_observation_from_game_state(
                    agent.last_game_state, self.agents.keys())
                infos[agent.name] = agent.score
                #rewards[agent.name] = 0#agent.aux_score
                dones[agent.name] = False
                #rewards[agent.name] -= np.average([v.aux_score for k, v in self.agents.items() if k != agent.name])

        for agent_name in action_dict.keys():
            if agent_name not in map(lambda a: a.name, self.active_agents):
                agent = self.agents[agent_name]
                agent.store_game_state(self.get_state_for_agent(agent))
                if agent.current_mode == "COLLECT":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_collect_observation_from_game_state(
                            agent.last_game_state)
                elif agent.current_mode == "KILL":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_kill_observation_from_game_state(
                            agent.last_game_state)
                elif agent.current_mode == "DESTROY":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_destroy_observation_from_game_state(
                            agent.last_game_state)
                self.agents_last_obs[
                    agent_name] = get_high_level_observation_from_game_state(
                        agent.last_game_state, self.agents.keys())

        d = dones.values()
        if any(d):
            for agent in self.agents.values():
                agent.penalty -= agent.high_level_aux_score
                agent.penalty += np.average([
                    v.high_level_aux_score for k, v in self.agents.items()
                    if k != agent.name
                ])
            for agent in self.agents.values():
                if agent.current_sub_id in dones and dones[
                        agent.current_sub_id]:
                    rewards[agent.name] = -agent.penalty
                    agent.penalty = 0
                if agent.dead and agent.high_level_aux_score > 0:
                    agent.last_task_successful = True
                agent.high_level_aux_score = 0
                '''
                if not agent.dead and dones[agent.current_sub_id]:
                    #print(f'{self.current_step} {agent.name}: Done {agent.current_mode}')
                    agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                    rewards[agent.name] = agent.high_level_aux_score - agent.penalty
                    #print(f'{self.current_step} {agent.name}: Reward received {rewards[agent.name]}')
                    #print(f'{self.current_step} {agent.name}: Penalty now 0')
                    agent.penalty = 0
                else:
                    agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                    #print(f'{self.current_step} {agent.name}: Penalty now {agent.penalty}')
                '''

        for agent in self.agents.values():
            agent.crates_destroyed = 0
            agent.aux_score = 0
            '''
            if not agent.dead:
                if dones[agent.current_sub_id]:
                    #print(f'{self.current_step} {agent.name}: Hl score now 0')
                    agent.high_level_aux_score = 0
            elif agent.high_level_aux_score > 0: # Died but made points
                for k, v in self.agents.items():
                    if k != agent.name:
                        v.penalty += agent.high_level_aux_score / 3.
                agent.penalty -= agent.high_level_aux_score
                agent.high_level_aux_score = 0
                agent.last_task_successful = True
            '''

        if self.done():
            self.end_round()
            dones['__all__'] = True
            # Determine winner and losers
            # winner can only contain a single agent with the highest score
            # loser contains agents without the highest score
            #winner, loser = self.get_winner_loser()
            for a in self.agents.values():
                #rewards[a.name] = 0
                # Add observation for agents that died earlier
                if a not in self.active_agents:
                    #rewards[a.name] = a.penalty
                    #a.store_game_state(self.get_state_for_agent(a))
                    #obs[a.name] = get_observation_from_game_state(a.last_game_state, self.agents.keys())
                    obs[a.name] = self.agents_last_obs[a.name]
                    obs[a.current_sub_id] = self.agents_last_obs[
                        a.current_sub_id]
                    rewards[
                        a.
                        current_sub_id] = 1. if a.last_task_successful else -1

                if a.name not in obs:
                    obs[a.name] = get_high_level_observation_from_game_state(
                        a.last_game_state, self.agents.keys())

                if a.name not in rewards:
                    rewards[a.name] = -a.penalty

                dones[a.current_sub_id] = True
                dones[a.name] = True
                #print(f'{self.current_step} {a.name}: Reward received {-a.penalty}')
                # Add rewards for all agents based on their final score
                #if a.name in winner:
                #rewards[a.name] = 3. / 3**(len(winner)-1)
                #rewards[a.name] = a.score - np.average([v.score for k, v in self.agents.items() if k != a.name])
                #elif a.name in loser:
                #    rewards[a.name] = -1.
                #else:
                #    rewards[a.name] = 0.
                infos[a.name] = a.score
        else:
            dones['__all__'] = False
        return obs, rewards, dones, infos