def log_returns(self,
                episode_id: str,
                reward_dict: MultiAgentDict,
                info_dict: MultiAgentDict = None,
                multiagent_done_dict: MultiAgentDict = None) -> None:
    """Record returns from the environment.

    The reward will be attributed to the previous action taken by the
    episode. Rewards accumulate until the next action. If no reward is
    logged before the next action, a reward of 0.0 is assumed.

    Args:
        episode_id: Episode id returned from start_episode().
        reward_dict: Reward from the environment agents.
        info_dict: Optional info dict.
        multiagent_done_dict: Optional done dict for agents.
    """
    episode = self._get(episode_id)

    # Accumulate rewards by agent: for agents we have already seen,
    # add the new reward onto the running total.
    for agent, rew in reward_dict.items():
        if agent in episode.cur_reward_dict:
            episode.cur_reward_dict[agent] += rew
        else:
            episode.cur_reward_dict[agent] = rew

    if multiagent_done_dict:
        for agent, done in multiagent_done_dict.items():
            episode.cur_done_dict[agent] = done

    if info_dict:
        episode.cur_info_dict = info_dict
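# A minimal, self-contained sketch of the accumulation semantics of
# log_returns() above. _Episode and _Recorder are hypothetical stand-ins for
# the episode object and its owner; only the accumulation logic mirrors the
# method itself.
class _Episode:
    def __init__(self):
        self.cur_reward_dict = {}
        self.cur_done_dict = {}
        self.cur_info_dict = {}


class _Recorder:
    def __init__(self):
        self._episodes = {"ep0": _Episode()}

    def _get(self, episode_id):
        return self._episodes[episode_id]

    def log_returns(self, episode_id, reward_dict,
                    info_dict=None, multiagent_done_dict=None):
        episode = self._get(episode_id)
        for agent, rew in reward_dict.items():
            episode.cur_reward_dict[agent] = (
                episode.cur_reward_dict.get(agent, 0.0) + rew)
        if multiagent_done_dict:
            episode.cur_done_dict.update(multiagent_done_dict)
        if info_dict:
            episode.cur_info_dict = info_dict


rec = _Recorder()
rec.log_returns("ep0", {"agent_0": 1.0})
rec.log_returns("ep0", {"agent_0": 0.5, "agent_1": 2.0})
# Rewards logged between two actions accumulate per agent:
assert rec._get("ep0").cur_reward_dict == {"agent_0": 1.5, "agent_1": 2.0}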
def observe(self, obs: MultiAgentDict, rewards: MultiAgentDict,
            dones: MultiAgentDict, infos: MultiAgentDict) -> None:
    self.last_obs = obs
    # Sum per-agent rewards across successive observe() calls.
    for ag, r in rewards.items():
        if ag in self.last_rewards:
            self.last_rewards[ag] += r
        else:
            self.last_rewards[ag] = r
    # OR-combine per-agent done flags so a True done is never overwritten.
    for ag, d in dones.items():
        if ag in self.last_dones:
            self.last_dones[ag] = self.last_dones[ag] or d
        else:
            self.last_dones[ag] = d
    self.last_infos = infos
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    observation, reward, done, reproduce = {}, {}, {}, {}
    alive = []

    # Step every agent that has not finished yet.
    for i, action in action_dict.items():
        if i not in self.dones:
            observation[i], reward[i], done[i], reproduce[i] = \
                self.agents[i].step(action)
            if done[i]:
                self.dones.append(i)
            alive.append(i)

    # Surviving agents that signalled `reproduce` spawn a new hunter,
    # which is keyed by its index in self.agents.
    for i in alive:
        if i not in self.dones:
            if reproduce[i]:
                new_hunter = HunterEnv()
                new_id = len(self.agents)
                observation[new_id] = new_hunter.reset()
                reward[new_id] = 0
                done[new_id] = False
                reproduce[new_id] = False
                self.agents.append(new_hunter)

    done["__all__"] = len(self.dones) == len(self.agents)
    self.alive = len(observation)
    return observation, reward, done, reproduce
def observation_space_contains(self, x: MultiAgentDict) -> bool:
    """Checks if the observation space contains the given observations.

    Args:
        x: Observations to check.

    Returns:
        True if the observation space contains all observations in x.
    """
    if (not hasattr(self, "_spaces_in_preferred_format")
            or self._spaces_in_preferred_format is None):
        self._spaces_in_preferred_format = (
            self._check_if_space_maps_agent_id_to_sub_space())
    if self._spaces_in_preferred_format:
        for key, agent_obs in x.items():
            if not self.observation_space[key].contains(agent_obs):
                return False
        if not all(k in self.observation_space for k in x):
            if log_once(
                    "possibly_bad_multi_agent_dict_missing_agent_observations"
            ):
                logger.warning(
                    "Your environment returns observations that are "
                    "MultiAgentDicts with incomplete information, i.e. "
                    "they only contain information on a subset of the "
                    "participating agents. Ignore this warning if this "
                    "is intended, for example if your environment is a "
                    "turn-based simulation.")
        return True

    logger.warning("observation_space_contains() has not been implemented")
    return True
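# A runnable sketch of the "preferred format" path above, where the
# observation space maps agent ids to sub-spaces and a MultiAgentDict covering
# only a subset of agents still passes (it merely triggers a warning
# upstream). The space layout here is an illustrative assumption built on
# gym.spaces, not the space of any particular environment in this file.
import numpy as np
from gym import spaces

observation_space = spaces.Dict({
    "agent_0": spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32),
    "agent_1": spaces.Discrete(4),
})

def contains(x: dict) -> bool:
    # Mirrors the per-agent check: every provided observation must lie in
    # the matching sub-space.
    return all(observation_space[k].contains(v) for k, v in x.items())

assert contains({"agent_0": np.zeros(3, dtype=np.float32)})  # subset is fine
assert not contains({"agent_1": 7})  # 7 is outside Discrete(4)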
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    processed_action_dict = {}
    for agent_id, action in action_dict.items():
        if agent_id in self.agents_to_action_converters:
            # Map the restricted-game action back to a base-game action.
            converter: RestrictedToBaseGameActionSpaceConverter = \
                self.agents_to_action_converters[agent_id]
            base_game_action, _, _ = converter.get_base_game_action(
                obs=self._agents_to_current_obs[agent_id],
                restricted_game_action=action,
                use_delegate_policy_exploration=self.
                _use_delegate_policy_exploration,
                clip_base_game_actions=self._clip_base_game_actions,
                delegate_policy_state=None)
            processed_action_dict[agent_id] = base_game_action
        else:
            processed_action_dict[agent_id] = action

    obs, rews, dones, infos = self.base_env.step(
        action_dict=processed_action_dict)

    # Cache the latest observation per agent for the next conversion.
    for agent_id, observation in obs.items():
        self._agents_to_current_obs[agent_id] = observation

    return obs, rews, dones, infos
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    obs = {}
    rewards = {}
    dones = {}
    infos = {}

    self.current_step += 1

    # Determine which agent gets to act first.
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        self.perform_agent_action(
            agent, self.available_actions[action_dict[agent.name]])

    # Update the arena after handling the agents' actions.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Set obs, reward, done, info for agents still alive. Agents that died
    # during this step get their final obs, reward, done, info when the
    # round finishes.
    for agent in self.active_agents:
        # Zero-sum shaping: own aux score minus the average of the others'.
        rewards[agent.name] = agent.aux_score
        rewards[agent.name] -= np.average([
            v.aux_score for k, v in self.agents.items() if k != agent.name
        ])
        agent.store_game_state(self.get_state_for_agent(agent))
        dones[agent.name] = False
        obs[agent.name] = get_kill_observation_from_game_state(
            agent.last_game_state)
        infos[agent.name] = agent.score

    # Cache a final observation for agents that just died.
    for agent_name in action_dict.keys():
        if agent_name not in map(lambda a: a.name, self.active_agents):
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            self.agents_last_obs[agent_name] = \
                get_kill_observation_from_game_state(agent.last_game_state)

    # Dead agents accumulate their shaped reward as a penalty to be paid
    # out when the round ends.
    for agent in self.agents.values():
        if agent.dead:
            agent.penalty += agent.aux_score
            agent.penalty -= np.average([
                v.aux_score for k, v in self.agents.items()
                if k != agent.name
            ])

    for agent in self.agents.values():
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        for a in self.agents.values():
            # Emit the cached observation for agents that died earlier.
            if a not in self.active_agents:
                rewards[a.name] = a.penalty
                obs[a.name] = self.agents_last_obs[a.name]
                dones[a.name] = True
                infos[a.name] = a.score
    else:
        dones['__all__'] = False

    return obs, rewards, dones, infos
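# A self-contained check of the reward shaping used above: each agent gets
# its own aux score minus the average aux score of all other agents, so the
# shaped rewards sum to zero across agents. Numbers here are illustrative.
import numpy as np

aux_scores = {"agent_0": 2.0, "agent_1": 0.5, "agent_2": -1.0}

shaped = {
    name: score - np.average(
        [v for k, v in aux_scores.items() if k != name])
    for name, score in aux_scores.items()
}
assert abs(sum(shaped.values())) < 1e-9  # zero-sum by construction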
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    obs = {}
    rewards = {}
    dones = {}
    infos = {}

    for agent_name, action in action_dict.items():
        if agent_name.endswith('_high'):
            # A high-level agent picked a sub-task: mint a fresh low-level
            # sub-agent id and hand it the matching observation.
            agent = self.flat_env.agents[agent_name]
            action_name = HIGH_LEVEL_ACTIONS[action]
            agent.high_level_steps += 1
            agent_id = (f'{agent.low_level_prefix}{action_name}_'
                        f'{agent_name}_{agent.high_level_steps}')
            if action_name == 'COLLECT':
                obs[agent_id] = get_collect_observation_from_game_state(
                    agent.last_game_state)
            elif action_name == 'DESTROY':
                obs[agent_id] = get_destroy_observation_from_game_state(
                    agent.last_game_state)
            elif action_name == 'KILL':
                obs[agent_id] = get_kill_observation_from_game_state(
                    agent.last_game_state)
            else:
                raise ValueError(
                    f'Unknown high-level action: {action_name}')
            rewards[agent_id] = 0
            dones[agent_id] = False
            self.high_low_mapping[agent_name] = agent_id
            agent.current_mode = action_name
            agent.current_sub_id = agent_id
        else:
            # A low-level sub-agent acted. Its id embeds the owning
            # high-level agent's name (e.g. 'agent_1_high'); recover it
            # and buffer the action for the flat environment.
            agent_parts = agent_name.split('_')
            high_level_agent_name = f'{agent_parts[2]}_{agent_parts[3]}_high'
            self.action_buffer[high_level_agent_name] = action

    # Step the flat env only once all active agents have buffered an action.
    if len(self.action_buffer) == len(self.flat_env.active_agents):
        obs, rewards, dones, infos = self.flat_env.step(self.action_buffer)
        self.action_buffer = {}
    else:
        dones['__all__'] = False

    return obs, rewards, dones, infos
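# A self-contained sketch of the hierarchical id scheme used above, assuming
# low_level_prefix == "low_" and high-level names like "agent_1_high" (both
# assumptions, not confirmed by the code itself). It shows how the
# split-based lookup in the low-level branch recovers the owning high-level
# agent from a sub-agent id.
def low_level_id(prefix: str, action_name: str, agent_name: str,
                 step: int) -> str:
    return f"{prefix}{action_name}_{agent_name}_{step}"

def owning_high_level(sub_agent_id: str) -> str:
    parts = sub_agent_id.split("_")
    return f"{parts[2]}_{parts[3]}_high"

sub_id = low_level_id("low_", "COLLECT", "agent_1_high", 3)
assert sub_id == "low_COLLECT_agent_1_high_3"
assert owning_high_level(sub_id) == "agent_1_high"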
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    # Guard: every action taken must be legal under the agent's current
    # restricted-game valid-actions mask.
    for agent_id, action in action_dict.items():
        if self._agents_to_current_valid_actions_mask[agent_id] is not None:
            assert self._agents_to_current_valid_actions_mask[agent_id][
                action] == 1.0, (
                    f"\nagent is {agent_id} "
                    f"action is {action} "
                    f"restricted valid_actions are "
                    f"{self._agents_to_current_valid_actions_mask[agent_id]}")

    base_obs_dict, rews, dones, infos = self.base_env.step(
        action_dict=action_dict)

    restricted_game_obs = self._convert_obs_to_restricted_game(
        base_game_obs_dict=base_obs_dict, dones=dones)

    return restricted_game_obs, rews, dones, infos
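# A small, runnable sketch of the valid-actions-mask invariant asserted in
# step() above: an action is legal only where the mask is 1.0, and a policy
# can respect the mask by renormalizing over the legal entries. All names
# here are illustrative.
import numpy as np

valid_actions_mask = np.array([1.0, 0.0, 1.0, 1.0])

def sample_legal_action(rng: np.random.Generator) -> int:
    probs = valid_actions_mask / valid_actions_mask.sum()
    return int(rng.choice(len(valid_actions_mask), p=probs))

rng = np.random.default_rng(0)
action = sample_legal_action(rng)
assert valid_actions_mask[action] == 1.0  # the invariant step() asserts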
def _convert_obs_to_restricted_game(self,
                                    base_game_obs_dict: MultiAgentDict,
                                    dones: MultiAgentDict):
    obs_dict_out = {}
    # Reset the valid-actions masks for both players.
    self._agents_to_current_valid_actions_mask = {
        agent: None for agent in range(2)
    }
    for agent_id, base_game_obs in base_game_obs_dict.items():
        if agent_id in self.agent_conversions:
            if not dones["__all__"]:
                base_game_obs_as_tuple = tuple(base_game_obs)
                try:
                    restricted_game_obs = self.agent_conversions[
                        agent_id].orig_obs_to_restricted_game_obs[
                            base_game_obs_as_tuple]
                except KeyError:
                    # Debugging aids for an unknown base-game observation.
                    assert isinstance(base_game_obs_as_tuple, tuple)
                    example_key = list(
                        self.agent_conversions[agent_id]
                        .orig_obs_to_restricted_game_obs.keys())[0]
                    assert base_game_obs_as_tuple[0] == example_key[0], (
                        f"key provided is {base_game_obs_as_tuple}\n"
                        f"agent id is {agent_id}\n"
                        f"example key is {example_key}")
                    assert len(base_game_obs_as_tuple) == len(example_key), (
                        f"{len(base_game_obs_as_tuple)} {len(example_key)}")
                    print(
                        f"keys are: "
                        f"{self.agent_conversions[agent_id].orig_obs_to_restricted_game_obs.keys()}"
                        f"\n\nlooking for {base_game_obs_as_tuple}")
                    raise
                self._agents_to_current_valid_actions_mask[agent_id] = \
                    self.agent_conversions[agent_id] \
                    .orig_obs_to_restricted_game_valid_actions_mask[
                        base_game_obs_as_tuple]
                obs_dict_out[agent_id] = restricted_game_obs
            else:
                # Terminal step: zero-pad the base observation up to the
                # restricted game's observation shape.
                restricted_game_obs = np.zeros(
                    shape=self.observation_space.shape, dtype=np.float32)
                restricted_game_obs[:len(base_game_obs)] = base_game_obs
                obs_dict_out[agent_id] = restricted_game_obs
        else:
            obs_dict_out[agent_id] = base_game_obs
    return obs_dict_out
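# Sketch of the terminal-step padding above: the base-game observation is
# copied into a zero vector shaped like the restricted game's observation
# space. The shape used here is an illustrative assumption.
import numpy as np

restricted_shape = (8,)
base_game_obs = np.array([1.0, 2.0, 3.0], dtype=np.float32)

padded = np.zeros(shape=restricted_shape, dtype=np.float32)
padded[:len(base_game_obs)] = base_game_obs
assert padded.tolist() == [1.0, 2.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0]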
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    observation, reward, done, reproduce = {}, {}, {}, {}
    alive = []

    # Step every agent that has not finished yet.
    for agent_id, action in action_dict.items():
        if agent_id not in self.dones:
            observation[agent_id], reward[agent_id], done[agent_id], \
                reproduce[agent_id] = self.agents[agent_id].step(action)
            if done[agent_id]:
                self.dones.append(agent_id)
            alive.append(agent_id)

    # Surviving agents that signalled `reproduce` spawn a new agent of
    # their own kind under a fresh id.
    for agent_id in alive:
        if agent_id not in self.dones:
            if reproduce[agent_id]:
                if "hunter" in agent_id:
                    self.hunter_count += 1
                    new_agent = HunterEnv()
                    new_id = "hunter_" + str(self.hunter_count)
                else:
                    self.prey_count += 1
                    new_agent = PreyEnv()
                    new_id = "prey_" + str(self.prey_count)
                observation[new_id] = new_agent.reset()
                reward[new_id] = 0
                done[new_id] = False
                reproduce[new_id] = False
                self.agents[new_id] = new_agent

    done["__all__"] = len(self.dones) == len(self.agents)
    self.alive = len(observation)
    return observation, reward, done, reproduce
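# A toy sketch of the id scheme used when an agent reproduces: per-species
# counters mint fresh ids such as "hunter_2" or "prey_2", so the returned
# dicts can gain new keys mid-episode. HunterEnv/PreyEnv are stubbed out here.
class _StubAgent:
    def reset(self):
        return 0.0

counts = {"hunter": 1, "prey": 1}
agents = {"hunter_1": _StubAgent(), "prey_1": _StubAgent()}

def spawn(parent_id: str) -> str:
    species = "hunter" if "hunter" in parent_id else "prey"
    counts[species] += 1
    new_id = f"{species}_{counts[species]}"
    agents[new_id] = _StubAgent()
    return new_id

assert spawn("hunter_1") == "hunter_2"
assert spawn("prey_1") == "prey_2"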
def observation_space_contains(self, x: MultiAgentDict) -> bool:
    if not isinstance(x, dict):
        return False
    return all(self.observation_space.contains(val) for val in x.values())
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    obs = {}
    rewards = {}
    dones = {}
    infos = {}

    self.current_step += 1

    # Determine which agent gets to act first.
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        self.perform_agent_action(
            agent, self.available_actions[action_dict[agent.name]])

    # Update the arena after handling the agents' actions.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Set obs, reward, done, info for agents still alive. Agents that died
    # during this step get their final obs, reward, done, info when the
    # round finishes.
    for agent in self.active_agents:
        # Zero-sum shaping: own aux score minus the average of the others'.
        rewards[agent.name] = agent.aux_score
        rewards[agent.name] -= np.average([
            v.aux_score for k, v in self.agents.items() if k != agent.name
        ])
        agent.store_game_state(self.get_state_for_agent(agent))
        dones[agent.name] = False
        obs[agent.name] = get_observation_from_game_state(
            agent.last_game_state, self.agents.keys())
        infos[agent.name] = agent.score

    # Cache a final observation for agents that just died.
    for agent_name in action_dict.keys():
        if agent_name not in map(lambda a: a.name, self.active_agents):
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            self.agents_last_obs[agent_name] = \
                get_observation_from_game_state(
                    agent.last_game_state, self.agents.keys())

    # Dead agents accumulate their shaped reward as a penalty to be paid
    # out when the round ends.
    for agent in self.agents.values():
        if agent.dead:
            agent.penalty += agent.aux_score
            agent.penalty -= np.average([
                v.aux_score for k, v in self.agents.items()
                if k != agent.name
            ])

    for agent in self.agents.values():
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        # Determine winner and losers: `winner` can only contain the single
        # agent with the highest score; `loser` contains the agents without
        # the highest score.
        winner, loser = self.get_winner_loser()
        for a in self.agents.values():
            # Emit the cached observation for agents that died earlier.
            if a not in self.active_agents:
                rewards[a.name] = a.penalty
                obs[a.name] = self.agents_last_obs[a.name]
                dones[a.name] = True
                infos[a.name] = a.score
    else:
        dones['__all__'] = False

    return obs, rewards, dones, infos
def step(
    self, action_dict: MultiAgentDict
) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
    obs = {}
    rewards = {}
    dones = {}
    infos = {}

    self.current_step += 1

    # Determine which agent gets to act first.
    random.shuffle(self.active_agents)
    for agent in self.active_agents:
        self.perform_agent_action(
            agent, LOW_LEVEL_ACTIONS[action_dict[agent.name]])

    # Update the arena after handling the agents' actions.
    self.collect_coins()
    self.update_bombs()
    self.evaluate_explosions()

    # Set obs, reward, done, info for agents still alive. Agents that died
    # during this step get their final obs, reward, done, info when the
    # round finishes.
    for agent in self.active_agents:
        agent.store_game_state(self.get_state_for_agent(agent))
        agent.low_level_steps += 1

        if agent.current_mode == "COLLECT":
            obs[agent.current_sub_id] = \
                get_collect_observation_from_game_state(agent.last_game_state)
            if agent.high_level_aux_score == 1:
                # Sub-task achieved.
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.crates_destroyed = 0
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                rewards[agent.current_sub_id] = -0.01
                dones[agent.current_sub_id] = False

        elif agent.current_mode == "KILL":
            obs[agent.current_sub_id] = \
                get_kill_observation_from_game_state(agent.last_game_state)
            if agent.high_level_aux_score >= 5:
                # Sub-task achieved.
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                rewards[agent.current_sub_id] = agent.crates_destroyed * 0.01
                rewards[agent.current_sub_id] -= 0.01
                dones[agent.current_sub_id] = False

        elif agent.current_mode == "DESTROY":
            obs[agent.current_sub_id] = \
                get_destroy_observation_from_game_state(agent.last_game_state)
            if agent.crates_destroyed > 0:
                # Sub-task achieved.
                rewards[agent.current_sub_id] = 1.0
                dones[agent.current_sub_id] = True
                agent.low_level_steps = 0
            elif agent.low_level_steps >= agent.max_low_level_steps:
                dones[agent.current_sub_id] = True
                rewards[agent.current_sub_id] = -1.0
                agent.low_level_steps = 0
            else:
                rewards[agent.current_sub_id] = -0.01
                dones[agent.current_sub_id] = False

        if dones[agent.current_sub_id]:
            # The sub-task ended, so the high-level agent acts next.
            obs[agent.name] = get_high_level_observation_from_game_state(
                agent.last_game_state, self.agents.keys())
            infos[agent.name] = agent.score
            dones[agent.name] = False

    # Cache final observations for agents that just died.
    for agent_name in action_dict.keys():
        if agent_name not in map(lambda a: a.name, self.active_agents):
            agent = self.agents[agent_name]
            agent.store_game_state(self.get_state_for_agent(agent))
            if agent.current_mode == "COLLECT":
                self.agents_last_obs[agent.current_sub_id] = \
                    get_collect_observation_from_game_state(
                        agent.last_game_state)
            elif agent.current_mode == "KILL":
                self.agents_last_obs[agent.current_sub_id] = \
                    get_kill_observation_from_game_state(
                        agent.last_game_state)
            elif agent.current_mode == "DESTROY":
                self.agents_last_obs[agent.current_sub_id] = \
                    get_destroy_observation_from_game_state(
                        agent.last_game_state)
            self.agents_last_obs[agent_name] = \
                get_high_level_observation_from_game_state(
                    agent.last_game_state, self.agents.keys())

    # Settle penalties whenever at least one sub-task finished this step.
    if any(dones.values()):
        for agent in self.agents.values():
            agent.penalty -= agent.high_level_aux_score
            agent.penalty += np.average([
                v.high_level_aux_score
                for k, v in self.agents.items() if k != agent.name
            ])
        for agent in self.agents.values():
            if agent.current_sub_id in dones and dones[agent.current_sub_id]:
                rewards[agent.name] = -agent.penalty
                agent.penalty = 0
            if agent.dead and agent.high_level_aux_score > 0:
                agent.last_task_successful = True
            agent.high_level_aux_score = 0

    for agent in self.agents.values():
        agent.crates_destroyed = 0
        agent.aux_score = 0

    if self.done():
        self.end_round()
        dones['__all__'] = True
        for a in self.agents.values():
            # Emit the cached observations for agents that died earlier.
            if a not in self.active_agents:
                obs[a.name] = self.agents_last_obs[a.name]
                obs[a.current_sub_id] = self.agents_last_obs[a.current_sub_id]
                rewards[a.current_sub_id] = \
                    1. if a.last_task_successful else -1.
            if a.name not in obs:
                obs[a.name] = get_high_level_observation_from_game_state(
                    a.last_game_state, self.agents.keys())
            if a.name not in rewards:
                rewards[a.name] = -a.penalty
            dones[a.current_sub_id] = True
            dones[a.name] = True
            infos[a.name] = a.score
    else:
        dones['__all__'] = False

    return obs, rewards, dones, infos