def step(self, action_dict: MultiAgentDict) -> Tuple[
        MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    """Run one environment step for all active agents.

    Actions from ``action_dict`` (agent name -> index into
    ``self.available_actions``) are applied in randomized order, the arena
    is updated, and RLlib-style ``(obs, rewards, dones, infos)`` dicts keyed
    by agent name are returned. Each living agent's reward is its own
    ``aux_score`` minus the average ``aux_score`` of all other agents.
    """
    obs = {}
    rewards = {}
    dones = {}
    infos = {}

    self.current_step += 1

    # Randomize the acting order so no agent has a fixed first-move advantage.
    random.shuffle(self.active_agents)

    for agent in self.active_agents:
        chosen = self.available_actions[action_dict[agent.name]]
        self.perform_agent_action(agent, chosen)

    # Resolve the arena once every action has been applied.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Living agents get their step results now; agents that died during this
    # step are settled later, when the round finishes.
    for agent in self.active_agents:
        opponent_scores = [
            v.aux_score for k, v in self.agents.items() if k != agent.name
        ]
        rewards[agent.name] = agent.aux_score - np.average(opponent_scores)
        agent.store_game_state(self.get_state_for_agent(agent))
        dones[agent.name] = False
        obs[agent.name] = get_kill_observation_from_game_state(agent.last_game_state)
        infos[agent.name] = agent.score

    # Cache a final observation for agents that submitted an action but are
    # no longer active (i.e. died this step).
    active_names = {a.name for a in self.active_agents}
    for agent_name in action_dict:
        if agent_name in active_names:
            continue
        agent = self.agents[agent_name]
        agent.store_game_state(self.get_state_for_agent(agent))
        self.agents_last_obs[agent_name] = get_kill_observation_from_game_state(
            agent.last_game_state)

    # Dead agents accumulate their relative score into a penalty that is
    # paid out as the final reward when the round ends.
    for agent in self.agents.values():
        if agent.dead:
            agent.penalty += agent.aux_score
            agent.penalty -= np.average(
                [v.aux_score for k, v in self.agents.items() if k != agent.name])

    # aux_score is a per-step accumulator; reset it for everyone.
    for agent in self.agents.values():
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        for finished in self.agents.values():
            # Settle agents that died earlier with their stored penalty/obs.
            if finished not in self.active_agents:
                rewards[finished.name] = finished.penalty
                obs[finished.name] = self.agents_last_obs[finished.name]
            dones[finished.name] = True
            infos[finished.name] = finished.score
    else:
        dones['__all__'] = False
    return obs, rewards, dones, infos
# --- Ejemplo n.º 2 ---
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        """Advance the multi-agent environment by one step.

        Applies every active agent's action (in randomized order), updates
        the arena, and builds RLlib-style return dicts keyed by agent name.

        Args:
            action_dict: maps agent name -> index into ``self.available_actions``.

        Returns:
            Tuple ``(obs, rewards, dones, infos)``; ``dones`` also carries
            the ``'__all__'`` flag required by RLlib.
        """
        obs = {}
        rewards = {}
        dones = {}
        infos = {}

        self.current_step += 1

        # Determine which agent gets to act first
        random.shuffle(self.active_agents)

        for agent in self.active_agents:
            self.perform_agent_action(
                agent, self.available_actions[action_dict[agent.name]])

        # Update arena after handling actions from agents
        self.collect_coins()
        self.update_bombs()
        self.evaluate_explosions()

        # Set obs, reward, done, info for agents still alive
        # Agents that died during this step will get their next obs, reward, done, info later when the round finishes
        for agent in self.active_agents:
            # Relative reward: own aux_score minus the mean of all opponents'.
            rewards[agent.name] = agent.aux_score
            rewards[agent.name] -= np.average([
                v.aux_score for k, v in self.agents.items() if k != agent.name
            ])
            #agent.crates_destroyed = 0# Reset aux score
            agent.store_game_state(self.get_state_for_agent(agent))
            dones[agent.name] = False
            obs[agent.name] = get_observation_from_game_state(
                agent.last_game_state, self.agents.keys())
            infos[agent.name] = agent.score

        # Agents that submitted an action but are no longer active (died this
        # step): store their final observation for the end-of-round payout.
        for agent_name in action_dict.keys():
            if agent_name not in map(lambda a: a.name, self.active_agents):
                agent = self.agents[agent_name]
                agent.store_game_state(self.get_state_for_agent(agent))
                self.agents_last_obs[
                    agent_name] = get_observation_from_game_state(
                        agent.last_game_state, self.agents.keys())

        # Dead agents accumulate their relative score into a penalty that is
        # paid out as their final reward when the round ends.
        for agent in self.agents.values():
            if agent.dead:
                agent.penalty += agent.aux_score
                agent.penalty -= np.average([
                    v.aux_score for k, v in self.agents.items()
                    if k != agent.name
                ])

        # aux_score is a per-step accumulator; reset for all agents.
        for agent in self.agents.values():
            agent.aux_score = 0

        if self.done():
            self.end_round()
            dones['__all__'] = True
            # Determine winner and losers
            # winner can only contain a single agent with the highest score
            # loser contains agents without the highest score
            # NOTE(review): winner/loser are currently unused below (the
            # score-based terminal rewards are commented out), but the call
            # is kept in case get_winner_loser() has side effects — confirm.
            winner, loser = self.get_winner_loser()
            for a in self.agents.values():
                #rewards[a.name] = 0
                # Add observation for agents that died earlier
                if a not in self.active_agents:
                    rewards[a.name] = a.penalty
                    #a.store_game_state(self.get_state_for_agent(a))
                    #obs[a.name] = get_observation_from_game_state(a.last_game_state, self.agents.keys())
                    obs[a.name] = self.agents_last_obs[a.name]
                # Add rewards for all agents based on their final score
                #if a.name in winner:
                #rewards[a.name] = 3. / 3**(len(winner)-1)
                #rewards[a.name] -= np.average([v.score + v.crates_destroyed * 0.05 for k, v in self.agents.items() if k != a.name])
                #elif a.name in loser:
                #    rewards[a.name] = -1.
                #else:
                #    rewards[a.name] = 0.
                dones[a.name] = True
                infos[a.name] = a.score
        else:
            dones['__all__'] = False
        return obs, rewards, dones, infos
# --- Ejemplo n.º 3 ---
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        """Advance the hierarchical multi-agent environment by one step.

        Each agent runs a high-level policy (keyed by ``agent.name``) that
        selects a mode — "COLLECT", "KILL" or "DESTROY" — and a low-level
        sub-policy (keyed by ``agent.current_sub_id``) that emits primitive
        actions from ``LOW_LEVEL_ACTIONS``. The returned dicts therefore mix
        both kinds of keys: sub-policy entries every step, plus high-level
        entries whenever a sub-task finishes.

        Args:
            action_dict: maps agent name -> index into ``LOW_LEVEL_ACTIONS``.

        Returns:
            Tuple ``(obs, rewards, dones, infos)``; ``dones`` also carries
            the ``'__all__'`` flag required by RLlib.
        """
        obs = {}
        rewards = {}
        dones = {}
        infos = {}

        self.current_step += 1

        # Determine which agent gets to act first
        random.shuffle(self.active_agents)

        for agent in self.active_agents:
            #print(f'Agent {agent.name} {agent.current_sub_id} - Action {LOW_LEVEL_ACTIONS[action_dict[agent.name]]}')
            self.perform_agent_action(
                agent, LOW_LEVEL_ACTIONS[action_dict[agent.name]])

        # Update arena after handling actions from agents
        self.collect_coins()
        self.update_bombs()
        self.evaluate_explosions()

        # Set obs, reward, done, info for agents still alive
        # Agents that died during this step will get their next obs, reward, done, info later when the round finishes
        # Per-mode sub-task bookkeeping: each mode has a success condition,
        # a step-budget timeout (max_low_level_steps) and a small per-step
        # shaping reward.
        for agent in self.active_agents:
            agent.store_game_state(self.get_state_for_agent(agent))
            agent.low_level_steps += 1
            if agent.current_mode == "COLLECT":
                obs[agent.
                    current_sub_id] = get_collect_observation_from_game_state(
                        agent.last_game_state)
                # Success: presumably a coin was collected (high_level_aux_score == 1) — confirm.
                if agent.high_level_aux_score == 1:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.crates_destroyed = 0
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    # Timeout: sub-task failed.
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    # Small per-step penalty to encourage fast completion.
                    rewards[agent.current_sub_id] = -0.01
                    dones[agent.current_sub_id] = False
            elif agent.current_mode == "KILL":
                obs[agent.
                    current_sub_id] = get_kill_observation_from_game_state(
                        agent.last_game_state)
                # Success: presumably an opponent kill (score >= 5) — confirm.
                if agent.high_level_aux_score >= 5:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    # Shaping: crates destroyed give a small bonus even in KILL mode.
                    rewards[
                        agent.current_sub_id] = agent.crates_destroyed * 0.01
                    rewards[agent.current_sub_id] -= 0.01
                    dones[agent.current_sub_id] = False
            elif agent.current_mode == "DESTROY":
                obs[agent.
                    current_sub_id] = get_destroy_observation_from_game_state(
                        agent.last_game_state)
                if agent.crates_destroyed > 0:
                    rewards[agent.current_sub_id] = 1.0
                    dones[agent.current_sub_id] = True
                    agent.low_level_steps = 0
                elif agent.low_level_steps >= agent.max_low_level_steps:
                    dones[agent.current_sub_id] = True
                    rewards[agent.current_sub_id] = -1.0
                    agent.low_level_steps = 0
                else:
                    rewards[agent.current_sub_id] = -0.01
                    dones[agent.current_sub_id] = False
            # When a sub-task ends, hand control back to the high-level policy
            # by emitting a high-level observation for this agent.
            if dones[agent.current_sub_id]:
                #print(f'Agent {agent.name} {agent.current_sub_id} finished {agent.current_mode}')
                obs[agent.name] = get_high_level_observation_from_game_state(
                    agent.last_game_state, self.agents.keys())
                infos[agent.name] = agent.score
                #rewards[agent.name] = 0#agent.aux_score
                dones[agent.name] = False
                #rewards[agent.name] -= np.average([v.aux_score for k, v in self.agents.items() if k != agent.name])

        # Agents that submitted an action but are no longer active (died this
        # step): store final low- and high-level observations for the payout
        # at the end of the round.
        for agent_name in action_dict.keys():
            if agent_name not in map(lambda a: a.name, self.active_agents):
                agent = self.agents[agent_name]
                agent.store_game_state(self.get_state_for_agent(agent))
                if agent.current_mode == "COLLECT":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_collect_observation_from_game_state(
                            agent.last_game_state)
                elif agent.current_mode == "KILL":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_kill_observation_from_game_state(
                            agent.last_game_state)
                elif agent.current_mode == "DESTROY":
                    self.agents_last_obs[
                        agent.
                        current_sub_id] = get_destroy_observation_from_game_state(
                            agent.last_game_state)
                self.agents_last_obs[
                    agent_name] = get_high_level_observation_from_game_state(
                        agent.last_game_state, self.agents.keys())

        # If any sub-task finished this step, settle high-level rewards:
        # every agent's penalty tracks (opponents' mean high-level score minus
        # its own), and agents whose sub-task just ended cash out -penalty.
        d = dones.values()
        if any(d):
            for agent in self.agents.values():
                agent.penalty -= agent.high_level_aux_score
                agent.penalty += np.average([
                    v.high_level_aux_score for k, v in self.agents.items()
                    if k != agent.name
                ])
            for agent in self.agents.values():
                if agent.current_sub_id in dones and dones[
                        agent.current_sub_id]:
                    rewards[agent.name] = -agent.penalty
                    agent.penalty = 0
                # Remember success for dead agents so the terminal sub-policy
                # reward (below) can still be positive.
                if agent.dead and agent.high_level_aux_score > 0:
                    agent.last_task_successful = True
                agent.high_level_aux_score = 0
                '''
                if not agent.dead and dones[agent.current_sub_id]:
                    #print(f'{self.current_step} {agent.name}: Done {agent.current_mode}')
                    agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                    rewards[agent.name] = agent.high_level_aux_score - agent.penalty
                    #print(f'{self.current_step} {agent.name}: Reward received {rewards[agent.name]}')
                    #print(f'{self.current_step} {agent.name}: Penalty now 0')
                    agent.penalty = 0
                else:
                    agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                    #print(f'{self.current_step} {agent.name}: Penalty now {agent.penalty}')
                '''

        # Per-step accumulators; reset for all agents.
        for agent in self.agents.values():
            agent.crates_destroyed = 0
            agent.aux_score = 0
            '''
            if not agent.dead:
                if dones[agent.current_sub_id]:
                    #print(f'{self.current_step} {agent.name}: Hl score now 0')
                    agent.high_level_aux_score = 0
            elif agent.high_level_aux_score > 0: # Died but made points
                for k, v in self.agents.items():
                    if k != agent.name:
                        v.penalty += agent.high_level_aux_score / 3.
                agent.penalty -= agent.high_level_aux_score
                agent.high_level_aux_score = 0
                agent.last_task_successful = True
            '''

        if self.done():
            self.end_round()
            dones['__all__'] = True
            # Determine winner and losers
            # winner can only contain a single agent with the highest score
            # loser contains agents without the highest score
            #winner, loser = self.get_winner_loser()
            for a in self.agents.values():
                #rewards[a.name] = 0
                # Add observation for agents that died earlier
                if a not in self.active_agents:
                    #rewards[a.name] = a.penalty
                    #a.store_game_state(self.get_state_for_agent(a))
                    #obs[a.name] = get_observation_from_game_state(a.last_game_state, self.agents.keys())
                    obs[a.name] = self.agents_last_obs[a.name]
                    obs[a.current_sub_id] = self.agents_last_obs[
                        a.current_sub_id]
                    # NOTE(review): mixes float 1. with int -1 — harmless
                    # numerically but inconsistent; confirm intended.
                    rewards[
                        a.
                        current_sub_id] = 1. if a.last_task_successful else -1
                # Ensure every agent has a terminal high-level obs/reward.
                if a.name not in obs:
                    obs[a.name] = get_high_level_observation_from_game_state(
                        a.last_game_state, self.agents.keys())

                if a.name not in rewards:
                    rewards[a.name] = -a.penalty

                dones[a.current_sub_id] = True
                dones[a.name] = True
                #print(f'{self.current_step} {a.name}: Reward received {-a.penalty}')
                # Add rewards for all agents based on their final score
                #if a.name in winner:
                #rewards[a.name] = 3. / 3**(len(winner)-1)
                #rewards[a.name] = a.score - np.average([v.score for k, v in self.agents.items() if k != a.name])
                #elif a.name in loser:
                #    rewards[a.name] = -1.
                #else:
                #    rewards[a.name] = 0.
                infos[a.name] = a.score
        else:
            dones['__all__'] = False
        return obs, rewards, dones, infos