def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    """Advance the round by one step for every still-active agent.

    Args:
        action_dict: Maps each agent name to the index of the action it
            chose this step (index into ``self.available_actions``).

    Returns:
        Tuple ``(obs, rewards, dones, infos)`` keyed by agent name, with
        the RLlib ``'__all__'`` flag in ``dones``.
    """
    obs = {}
    rewards = {}
    dones = {}
    infos = {}
    self.current_step += 1

    # Shuffle so no agent systematically gets to act first.
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        self.perform_agent_action(
            agent, self.available_actions[action_dict[agent.name]])

    # Update arena after handling actions from agents.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Set obs, reward, done, info for agents still alive. Agents that died
    # during this step get their final transition when the round finishes.
    for agent in self.active_agents:
        # Zero-sum shaping: own aux score minus the mean of everyone else's.
        rewards[agent.name] = agent.aux_score - np.average(
            [v.aux_score for k, v in self.agents.items() if k != agent.name])
        agent.store_game_state(self.get_state_for_agent(agent))
        dones[agent.name] = False
        obs[agent.name] = get_kill_observation_from_game_state(
            agent.last_game_state)
        infos[agent.name] = agent.score

    # Cache a final observation for agents that submitted an action but are
    # no longer active (they died this step). Build the name set once
    # instead of re-scanning active_agents for every action_dict entry.
    active_names = {a.name for a in self.active_agents}
    for agent_name in action_dict.keys():
        if agent_name not in active_names:
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            self.agents_last_obs[agent_name] = \
                get_kill_observation_from_game_state(agent.last_game_state)

    # Dead agents keep accruing the zero-sum balance as a penalty that is
    # paid out once the round ends.
    for agent in self.agents.values():
        if agent.dead:
            agent.penalty += agent.aux_score - np.average(
                [v.aux_score for k, v in self.agents.items()
                 if k != agent.name])

    # Aux scores are per-step quantities; reset for the next step.
    for agent in self.agents.values():
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        # Emit the final transition for agents that died earlier, using the
        # observation cached at the moment of their death.
        for a in self.agents.values():
            if a not in self.active_agents:
                rewards[a.name] = a.penalty
                obs[a.name] = self.agents_last_obs[a.name]
                dones[a.name] = True
                infos[a.name] = a.score
    else:
        dones['__all__'] = False
    return obs, rewards, dones, infos
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    """Advance the round by one step for every still-active agent.

    Args:
        action_dict: Maps each agent name to the index of the action it
            chose this step (index into ``self.available_actions``).

    Returns:
        Tuple ``(obs, rewards, dones, infos)`` keyed by agent name, with
        the RLlib ``'__all__'`` flag in ``dones``.
    """
    obs = {}
    rewards = {}
    dones = {}
    infos = {}
    self.current_step += 1

    # Shuffle so no agent systematically gets to act first.
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        self.perform_agent_action(
            agent, self.available_actions[action_dict[agent.name]])

    # Update arena after handling actions from agents.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Set obs, reward, done, info for agents still alive. Agents that died
    # during this step get their final transition when the round finishes.
    for agent in self.active_agents:
        # Zero-sum shaping: own aux score minus the mean of everyone else's.
        rewards[agent.name] = agent.aux_score - np.average(
            [v.aux_score for k, v in self.agents.items() if k != agent.name])
        agent.store_game_state(self.get_state_for_agent(agent))
        dones[agent.name] = False
        obs[agent.name] = get_observation_from_game_state(
            agent.last_game_state, self.agents.keys())
        infos[agent.name] = agent.score

    # Cache a final observation for agents that submitted an action but are
    # no longer active (they died this step). Build the name set once
    # instead of re-scanning active_agents for every action_dict entry.
    active_names = {a.name for a in self.active_agents}
    for agent_name in action_dict.keys():
        if agent_name not in active_names:
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            self.agents_last_obs[agent_name] = get_observation_from_game_state(
                agent.last_game_state, self.agents.keys())

    # Dead agents keep accruing the zero-sum balance as a penalty that is
    # paid out once the round ends.
    for agent in self.agents.values():
        if agent.dead:
            agent.penalty += agent.aux_score - np.average(
                [v.aux_score for k, v in self.agents.items()
                 if k != agent.name])

    # Aux scores are per-step quantities; reset for the next step.
    for agent in self.agents.values():
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        # NOTE(review): the winner/loser split is no longer used for the
        # final reward (that comes from the accrued penalty), but the call
        # is kept in case it performs bookkeeping — confirm before removing.
        self.get_winner_loser()
        # Emit the final transition for agents that died earlier, using the
        # observation cached at the moment of their death.
        for a in self.agents.values():
            if a not in self.active_agents:
                rewards[a.name] = a.penalty
                obs[a.name] = self.agents_last_obs[a.name]
                dones[a.name] = True
                infos[a.name] = a.score
    else:
        dones['__all__'] = False
    return obs, rewards, dones, infos
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict,
           MultiAgentDict]:
    """Advance the hierarchical environment by one low-level step.

    Every active agent executes the low-level action chosen for its current
    sub-policy (``agent.current_mode`` is one of "COLLECT", "KILL",
    "DESTROY"). Observations/rewards/dones are emitted under two keys per
    agent: ``agent.current_sub_id`` (the low-level policy) and
    ``agent.name`` (the high-level policy, only when a sub-task ends).

    Returns:
        Tuple ``(obs, rewards, dones, infos)`` with the RLlib ``'__all__'``
        flag in ``dones``.
    """
    obs = {}
    rewards = {}
    dones = {}
    infos = {}
    self.current_step += 1
    # Determine which agent gets to act first
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        #print(f'Agent {agent.name} {agent.current_sub_id} - Action {LOW_LEVEL_ACTIONS[action_dict[agent.name]]}')
        self.perform_agent_action(
            agent, LOW_LEVEL_ACTIONS[action_dict[agent.name]])
    # Update arena after handling actions from agents
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()
    # Set obs, reward, done, info for agents still alive.
    # Agents that died during this step will get their next obs, reward,
    # done, info later when the round finishes.
    for agent in self.active_agents:
        agent.store_game_state(self.get_state_for_agent(agent))
        agent.low_level_steps += 1
        if agent.current_mode == "COLLECT":
            obs[agent.
                current_sub_id] = get_collect_observation_from_game_state(
                    agent.last_game_state)
            # Sub-task succeeds when exactly one aux point was scored
            # (presumably one collected coin — confirm scoring rules).
            if agent.high_level_aux_score == 1:
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.crates_destroyed = 0
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                # Sub-task timed out.
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                # Small per-step cost to encourage finishing quickly.
                rewards[agent.current_sub_id] = -0.01
                dones[agent.current_sub_id] = False
        elif agent.current_mode == "KILL":
            obs[agent.
                current_sub_id] = get_kill_observation_from_game_state(
                    agent.last_game_state)
            # Threshold 5 presumably corresponds to the points awarded for
            # a kill — TODO confirm against the scoring constants.
            if agent.high_level_aux_score >= 5:
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                # Sub-task timed out.
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                # Shaping: small bonus per destroyed crate, minus step cost.
                rewards[
                    agent.current_sub_id] = agent.crates_destroyed * 0.01
                rewards[agent.current_sub_id] -= 0.01
                dones[agent.current_sub_id] = False
        elif agent.current_mode == "DESTROY":
            obs[agent.
                current_sub_id] = get_destroy_observation_from_game_state(
                    agent.last_game_state)
            # Sub-task succeeds as soon as any crate was destroyed.
            if agent.crates_destroyed > 0:
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                # Sub-task timed out.
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                rewards[agent.current_sub_id] = -0.01
                dones[agent.current_sub_id] = False
        if dones[agent.current_sub_id]:
            #print(f'Agent {agent.name} {agent.current_sub_id} finished {agent.current_mode}')
            # Sub-task ended: hand control back to the high-level policy by
            # emitting a fresh high-level observation for this agent.
            obs[agent.name] = get_high_level_observation_from_game_state(
                agent.last_game_state, self.agents.keys())
            infos[agent.name] = agent.score
            #rewards[agent.name] = 0#agent.aux_score
            dones[agent.name] = False
            #rewards[agent.name] -= np.average([v.aux_score for k, v in self.agents.items() if k != agent.name])
    # Cache final observations (low- and high-level) for agents that sent
    # an action but are no longer active, i.e. died this step.
    for agent_name in action_dict.keys():
        if agent_name not in map(lambda a: a.name, self.active_agents):
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            if agent.current_mode == "COLLECT":
                self.agents_last_obs[
                    agent.
                    current_sub_id] = get_collect_observation_from_game_state(
                        agent.last_game_state)
            elif agent.current_mode == "KILL":
                self.agents_last_obs[
                    agent.
                    current_sub_id] = get_kill_observation_from_game_state(
                        agent.last_game_state)
            elif agent.current_mode == "DESTROY":
                self.agents_last_obs[
                    agent.
                    current_sub_id] = get_destroy_observation_from_game_state(
                        agent.last_game_state)
            self.agents_last_obs[
                agent_name] = get_high_level_observation_from_game_state(
                    agent.last_game_state, self.agents.keys())
    d = dones.values()
    if any(d):
        # At least one sub-task ended this step: settle the zero-sum
        # high-level bookkeeping for all agents.
        for agent in self.agents.values():
            agent.penalty -= agent.high_level_aux_score
            agent.penalty += np.average([
                v.high_level_aux_score for k, v in self.agents.items()
                if k != agent.name
            ])
        for agent in self.agents.values():
            # Pay out the accumulated (negated) penalty to agents whose
            # sub-task just finished, then reset it.
            if agent.current_sub_id in dones and dones[
                    agent.current_sub_id]:
                rewards[agent.name] = -agent.penalty
                agent.penalty = 0
            # Dead agents that still scored are flagged so the terminal
            # low-level reward (below) treats the task as successful.
            if agent.dead and agent.high_level_aux_score > 0:
                agent.last_task_successful = True
            # NOTE(review): the aux score appears to reset unconditionally
            # whenever any sub-task ends — confirm this matches the
            # intended settlement window.
            agent.high_level_aux_score = 0
            '''
            if not agent.dead and dones[agent.current_sub_id]:
                #print(f'{self.current_step} {agent.name}: Done {agent.current_mode}')
                agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                rewards[agent.name] = agent.high_level_aux_score - agent.penalty
                #print(f'{self.current_step} {agent.name}: Reward received {rewards[agent.name]}')
                #print(f'{self.current_step} {agent.name}: Penalty now 0')
                agent.penalty = 0
            else:
                agent.penalty += np.average([v.high_level_aux_score for k, v in self.agents.items() if k != agent.name])
                #print(f'{self.current_step} {agent.name}: Penalty now {agent.penalty}')
            '''
    # Per-step counters reset for everyone.
    for agent in self.agents.values():
        agent.crates_destroyed = 0
        agent.aux_score = 0
        '''
        if not agent.dead:
            if dones[agent.current_sub_id]:
                #print(f'{self.current_step} {agent.name}: Hl score now 0')
                agent.high_level_aux_score = 0
        elif agent.high_level_aux_score > 0:
            # Died but made points
            for k, v in self.agents.items():
                if k != agent.name:
                    v.penalty += agent.high_level_aux_score / 3.
            agent.penalty -= agent.high_level_aux_score
            agent.high_level_aux_score = 0
            agent.last_task_successful = True
        '''
    if self.done():
        self.end_round()
        dones['__all__'] = True
        # Determine winner and losers
        # winner can only contain a single agent with the highest score
        # loser contains agents without the highest score
        #winner, loser = self.get_winner_loser()
        for a in self.agents.values():
            #rewards[a.name] = 0
            # Add observation for agents that died earlier
            if a not in self.active_agents:
                #rewards[a.name] = a.penalty
                #a.store_game_state(self.get_state_for_agent(a))
                #obs[a.name] = get_observation_from_game_state(a.last_game_state, self.agents.keys())
                obs[a.name] = self.agents_last_obs[a.name]
                obs[a.current_sub_id] = self.agents_last_obs[
                    a.current_sub_id]
                # Terminal low-level reward depends on whether the last
                # sub-task ended successfully before death.
                rewards[
                    a.
                    current_sub_id] = 1. if a.last_task_successful else -1
            # Agents without a fresh high-level obs/reward this step get
            # their terminal values here.
            if a.name not in obs:
                obs[a.name] = get_high_level_observation_from_game_state(
                    a.last_game_state, self.agents.keys())
            if a.name not in rewards:
                rewards[a.name] = -a.penalty
            dones[a.current_sub_id] = True
            dones[a.name] = True
            #print(f'{self.current_step} {a.name}: Reward received {-a.penalty}')
            # Add rewards for all agents based on their final score
            #if a.name in winner:
                #rewards[a.name] = 3. / 3**(len(winner)-1)
                #rewards[a.name] = a.score - np.average([v.score for k, v in self.agents.items() if k != a.name])
            #elif a.name in loser:
            #    rewards[a.name] = -1.
            #else:
            #    rewards[a.name] = 0.
            infos[a.name] = a.score
    else:
        dones['__all__'] = False
    return obs, rewards, dones, infos