class MultiAgent():
    """Wrapper that trains several DDPG-style agents against a shared,
    prioritized replay buffer. Assumes Agent, ReplayBuffer and the
    hyperparameters BUFFER_SIZE, BATCH_SIZE and GAMMA are defined elsewhere
    in the project."""

    def __init__(self, num_agents, state_size, action_size):
        self.agents = []
        for i in range(num_agents):
            self.agents.append(Agent(state_size, action_size))
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

    def step(self, states, actions, rewards, next_states, done):
        # Store each agent's transition, then learn once enough samples exist.
        for i in range(len(states)):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], done)
        if len(self.memory) > BATCH_SIZE:
            experiences, indexes = self.memory.sample()
            for agent in self.agents:
                error = agent.learn(experiences, GAMMA)
                # Update the priorities of the sampled experiences.
                self.memory.update(indexes, abs(error))

    def act(self, states, add_noise=True, noise_weight=1.0):
        # Each agent acts on its own observation.
        actions = []
        for i in range(len(self.agents)):
            actions.append(self.agents[i].act(states[i], add_noise, noise_weight))
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()
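The wrapper above relies on an Agent class, a prioritized ReplayBuffer, and module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA) defined elsewhere. A minimal sketch of how it could be driven from a training loop follows; the constant values and the env interface (reset()/step() returning per-agent states, per-agent rewards, and a single done flag) are assumptions for illustration, not part of the original code.

# Minimal training-loop sketch for the MultiAgent wrapper above.
# The hyperparameter values and the `env` API are hypothetical.
BUFFER_SIZE = int(1e5)   # assumed replay-buffer capacity
BATCH_SIZE = 128         # assumed minibatch size
GAMMA = 0.99             # assumed discount factor

def train(env, num_agents, state_size, action_size, n_episodes=1000):
    multi_agent = MultiAgent(num_agents, state_size, action_size)
    for episode in range(1, n_episodes + 1):
        states = env.reset()                                  # hypothetical env API
        multi_agent.reset()
        episode_rewards = [0.0] * num_agents
        done = False
        while not done:
            actions = multi_agent.act(states, add_noise=True)
            next_states, rewards, done = env.step(actions)    # hypothetical env API
            multi_agent.step(states, actions, rewards, next_states, done)
            states = next_states
            for i, r in enumerate(rewards):
                episode_rewards[i] += r
        print("Episode {}\tmax score: {:.3f}".format(episode, max(episode_rewards)))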
import os

import numpy as np
import torch


class maddpg:
    """Wrapper class managing the different agents in the environment.
    Assumes ddpg, ReplayBuffer, device and the hyperparameters BUFFER_SIZE,
    BATCH_SIZE and LEARNING_PERIOD are defined elsewhere in the project."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.

        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.agents = [ddpg(state_size, action_size, i + 1, random_seed=0)
                       for i in range(num_agents)]

        # Replay memory shared by all agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

    def reset(self):
        """Resets the OU noise process for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False):
        """Picks an action for each agent given its observation."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save the experience in replay memory and learn periodically."""
        # Flatten the per-agent observations/actions into a single joint row.
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)

    def learn(self, experiences, agent_number):
        """Update one agent. The critic takes the combined observations and
        actions of all agents as input, so first collect the target and local
        actions of every agent for the sampled 'experiences'."""
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)

    def save_weights(self, dir):
        """Save each agent's actor and critic weights to 'dir'."""
        for i in range(self.num_agents):
            torch.save(self.agents[i].actor_local.state_dict(),
                       os.path.join(dir, 'checkpoint_actor_{}.pth'.format(i)))
            torch.save(self.agents[i].critic_local.state_dict(),
                       os.path.join(dir, 'checkpoint_critic_{}.pth'.format(i)))
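The step() and learn() methods above do a fair amount of shape bookkeeping: per-agent observations are flattened into one joint row before being stored in the shared buffer, then reshaped back into per-agent slices when learning. The short sketch below, assuming the default sizes (2 agents, 24-dimensional observations), only demonstrates those reshape/index_select steps and is independent of the rest of the code.

# Shape bookkeeping sketch for the maddpg wrapper (sizes are the defaults above).
import numpy as np
import torch

num_agents, state_size = 2, 24

# step(): per-agent observations are flattened into one joint row
# before being added to the shared replay buffer.
states = np.random.randn(num_agents, state_size)           # (2, 24)
joint_state = states.reshape(1, -1)                        # (1, 48)

# learn(): a sampled batch of joint states is reshaped back to
# (batch, num_agents, state_size) so each agent's slice can be selected.
batch_size = 128
batch = torch.randn(batch_size, num_agents * state_size)   # (128, 48)
per_agent = batch.reshape(-1, num_agents, state_size)      # (128, 2, 24)
agent_0_obs = per_agent.index_select(1, torch.tensor([0])).squeeze(1)  # (128, 24)
assert agent_0_obs.shape == (batch_size, state_size)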