class MAgent():
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 shared_replay_buffer):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        random.seed(random_seed)
        self.shared_replay_buffer = shared_replay_buffer
        self.t_step = 0

        if shared_replay_buffer:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)
            shared_memory = self.memory
        else:
            # each agent maintains its own buffer inside the Agent class
            shared_memory = None
            self.memory = None

        print("ma shared_memory -> ", shared_memory)

        self.ddpg_agents = [
            Agent(state_size, action_size, random_seed, shared_memory)
            for _ in range(num_agents)
        ]

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def act(self, all_states):
        """Get actions from all agents in the MADDPG object."""
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.ddpg_agents, all_states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory (shared buffer or per-agent buffers)
        if self.shared_replay_buffer:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            for agent, state, action, reward, next_state, done in zip(
                    self.ddpg_agents, states, actions, rewards, next_states,
                    dones):
                agent.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            memory = (self.memory if self.shared_replay_buffer
                      else self.ddpg_agents[0].memory)
            if len(memory) > BATCH_SIZE:
                for agent in self.ddpg_agents:
                    if self.shared_replay_buffer:
                        experiences = self.memory.sample()
                    else:
                        experiences = agent.memory.sample()
                    agent.learn(experiences, GAMMA)
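All of the wrappers in this collection assume a uniformly sampled ReplayBuffer with the signature ReplayBuffer(action_size, buffer_size, batch_size, seed) and the methods add(), sample(), and __len__(), and several reference module-level hyperparameters (BUFFER_SIZE, BATCH_SIZE, GAMMA, UPDATE_EVERY, LEARN_EVERY, LEARN_N_TIMES, LEARNING_PERIOD) plus a global device. The following is a minimal sketch of these pieces for reference only; the constant values are illustrative assumptions and may differ from the original projects.

import random
from collections import deque, namedtuple

import numpy as np
import torch

# Assumed hyperparameter values; the originals may differ.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 256         # minibatch size
GAMMA = 0.99             # discount factor
UPDATE_EVERY = 4         # learning interval used by MAgent.step
LEARN_EVERY = 1          # learning interval used by MADDPGAgent.step
LEARN_N_TIMES = 1        # learning passes per interval
LEARNING_PERIOD = 2      # learning interval used by maddpg_agent.step

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience tuple to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and convert them to tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(
                np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)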
class MADDPGAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        # one independent DDPG agent per player
        # (note: `[Agent(...)] * num_agents` would alias a single instance)
        self.agents = [
            Agent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.shared_memory = ReplayBuffer(action_size, BUFFER_SIZE,
                                          BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.shared_memory.add(state, action, reward, next_state, done)

        if len(self.shared_memory) > BATCH_SIZE and step % LEARN_EVERY == 0:
            for _ in range(LEARN_N_TIMES):
                for agent in self.agents:
                    experiences = self.shared_memory.sample()
                    agent.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            state = np.expand_dims(state, axis=0)
            action = agent.act(state)
            action = np.reshape(action, newshape=(-1))
            actions.append(action)
        actions = np.stack(actions)
        return actions

    def save_weights(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_' + str(i) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_' + str(i) + '.pth')

    def load_weights(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_' + str(i) + '.pth'))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_' + str(i) + '.pth'))

    def reset(self):
        for agent in self.agents:
            agent.reset()
def __init__(self, state_size, action_size, num_agents, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size,
                             random_seed).to(ddpg.device)
    self.actor_target = Actor(state_size, action_size,
                              random_seed).to(ddpg.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=ddpg.LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size,
                               random_seed).to(ddpg.device)
    self.critic_target = Critic(state_size, action_size,
                                random_seed).to(ddpg.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=ddpg.LR_CRITIC,
                                       weight_decay=ddpg.WEIGHT_DECAY)

    # Replay memory
    self.memory = ReplayBuffer(action_size, ddpg.BUFFER_SIZE,
                               ddpg.BATCH_SIZE, random_seed)

    # Create agents
    self.agents = []
    for i in range(num_agents):
        agent = Agent(self, state_size, action_size, random_seed)
        self.agents.append(agent)
class MADDPG:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size,
                                   self.config.buffer_size,
                                   self.config.batch_size, self.config.seed)
        self.agents = [
            Agent(self.config) for _ in range(self.config.num_agents)
        ]
        # 'action_size', 'num_agents', and 'random_seed'
        # self.agents = [Agent(self.config, self.config.action_size,
        #                      self.config.num_agents, self.config.random_seed)
        #                for _ in range(self.config.num_agents)]
        self.t_step = 0
        self.loss = (0.0, 0.0)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, self.t_step, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % self.config.update_every == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.config.batch_size:
                closs = []
                aloss = []
                for agent in self.agents:
                    experiences = self.memory.sample()
                    critic_loss, actor_loss = agent.learn(
                        experiences, self.config.discount)
                    closs.append(critic_loss)
                    aloss.append(actor_loss)
                self.loss = (np.mean(closs), np.mean(aloss))
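The config-driven MADDPG class above expects a config object exposing at least action_size, buffer_size, batch_size, seed, num_agents, update_every, and discount (whatever Agent reads internally is not shown here). A minimal sketch of such a container follows; the attribute names mirror the accesses above, while the values and the state_size field are illustrative assumptions.

from types import SimpleNamespace

# Illustrative values only; the originals may differ.
config = SimpleNamespace(
    state_size=24,         # observation size per agent (assumed, Tennis-like)
    action_size=2,         # action size per agent
    num_agents=2,
    buffer_size=int(1e6),  # replay buffer capacity
    batch_size=256,        # minibatch size
    update_every=4,        # learn every N environment steps
    discount=0.99,         # reward discount factor (gamma)
    seed=0,
)

maddpg = MADDPG(config)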
def __init__(self, num_agents, state_size, action_size, random_seed):
    self.num_agents = num_agents
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                               random_seed)
    self.agents = [
        Agent(state_size, action_size, self.memory, BATCH_SIZE, random_seed)
        for agent_posit in range(num_agents)
    ]
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau,
               lr_actor, lr_critic, weight_decay):
    memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
    agents = [
        Agent(state_size, action_size, seed, buffer_size, batch_size, gamma,
              tau, lr_actor, lr_critic, weight_decay, memory)
        for _ in range(num_agents)
    ]
    load(agents)

    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        episode_scores = np.zeros(num_agents)
        while True:
            for agent in agents:
                agent.reset()
            actions = list()
            for agent, state in zip(agents, states):
                actions.append(agent.act(state))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for agent, state, action, reward, next_state, done in zip(
                    agents, states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)
            states = next_states
            episode_scores += np.array(rewards)
            if np.any(dones):
                break

        score = episode_scores.max()
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'.format(
            i_episode, np.mean(score), np.mean(scores_deque)), end="")
        if i_episode % 10 == 0:
            save(agents)
        if np.mean(scores_deque) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))
            break

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
class maddpg_agent:
    """Wrapper class managing different agents in the environment."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.

        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.agents = [ddpg_agent(state_size, action_size, i + 1, random_seed=0)
                       for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False):
        """Picks an action for each agent given its observation."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory and trigger learning."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)

    def learn(self, experiences, agent_number):
        """The critic takes as its input the combined observations and actions
        from all agents. Collect actions from each agent for the given
        `experiences`."""
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)
class MADDPG():
    """Agent that contains the two DDPG agents and shared replay buffer."""

    def __init__(self, state_size=24, action_size=2, n_agents=2,
                 buffer_size=100000, batch_size=256, gamma=0.999,
                 update_every=4, noise_start=1.0, noise_decay=1.0,
                 t_stop_noise=30000, seed=0):
        """
        Params
        ======
            action_size (int): dimension of each action
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            t_stop_noise (int): max number of timesteps with noise applied in training
            seed (int): Random seed
        """
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.noise_on = True
        self.t_stop_noise = t_stop_noise

        # models = [model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(i, state_size, action_size, n_agents) for i in range(n_agents)
        ]
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        if self.t_step > self.t_stop_noise:
            self.noise_on = False

        self.t_step += 1
        if self.t_step % self.update_every == 0 and len(
                self.memory) > self.batch_size:
            experiences = [self.memory.sample() for _ in range(self.n_agents)]
            self.learn(experiences, self.gamma)

    def act(self, all_states, add_noise=True):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=self.noise_weight,
                               add_noise=self.noise_on)
            self.noise_weight *= self.noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)

    def save_agents(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(), f"actor_agent{i}.pth")
            torch.save(agent.critic_local.state_dict(), f"critic_agent{i}.pth")
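A sketch of how this MADDPG wrapper would typically be driven, assuming the Unity Tennis interface (env, brain_name) set up as in the train() functions further below. The episode cap is an illustrative assumption; the +0.5 average-score target matches the solve criterion used in the training scripts in this collection.

import numpy as np
from collections import deque

# Hypothetical driver loop for the MADDPG wrapper above; `env` and `brain_name`
# are assumed to come from a UnityEnvironment setup as shown elsewhere here.
maddpg = MADDPG()
scores_window = deque(maxlen=100)

for i_episode in range(1, 2001):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations            # shape (2, 24)
    scores = np.zeros(maddpg.n_agents)
    while True:
        actions = maddpg.act(states)                 # shape (1, 4)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        maddpg.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    scores_window.append(np.max(scores))             # episode score = best agent's score
    if np.mean(scores_window) >= 0.5:                # Tennis solve threshold
        maddpg.save_agents()
        break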
            torch.save(agent.actor_local.state_dict(),
                       model_path.format('actor', i_episode))
            torch.save(agent.critic_local.state_dict(),
                       model_path.format('critic', i_episode))
            np.save('scores_{0}.npy'.format(i_episode), scores)

        if np.mean(scores_window) >= success_score:
            tag = 'success'
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(i_episode - 100, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(),
                       model_path.format('actor', tag))
            torch.save(agent.critic_local.state_dict(),
                       model_path.format('critic', tag))
            np.save('scores_{0}.npy'.format(tag), scores)
            break

    return list_scores


if __name__ == '__main__':
    print(opt)
    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    state_size, action_size = get_settings(env_info, brain)

    memory = ReplayBuffer(action_size, opt.buffer_size, opt.batch_size,
                          opt.seed)
    agent = Agent(state_size, action_size, opt.seed, opt.buffer_size,
                  opt.batch_size, opt.gamma, opt.tau, opt.lr_actor,
                  opt.lr_critic, opt.weight_decay)

    scores = train_agent(env, agent, brain_name, opt.model_path,
                         opt.n_episodes, opt.success_score)
    env.close()
class MultiAgent:
    """Meta agent that contains the two DDPG agents and shared replay buffer."""

    def __init__(self, config):
        self.config = config
        self.n_agents = config.env.n_agents
        self.ddpg_agents = [
            Agent(i, config) for i in range(self.config.env.n_agents)
        ]
        # the shared replay buffer
        self.memory = ReplayBuffer(config)
        self.t_step = 0

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):
        states = states.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        self.t_step = (self.t_step + 1) % self.config.hp.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.config.hp.batch_size:
                for _ in range(self.config.hp.num_updates):
                    # each agent does its own sampling from the replay buffer
                    experiences = [
                        self.memory.sample()
                        for _ in range(self.config.env.n_agents)
                    ]
                    self.learn(experiences, self.config.hp.gamma)

    def act(self, states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.ddpg_agents, states):
            action = agent.act(state, add_noise=True)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        # each agent uses its own target actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            next_state = next_states.reshape(-1, self.config.env.action_size,
                                             self.config.env.state_size) \
                .index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent uses its own local actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            state = states.reshape(-1, self.config.env.action_size,
                                   self.config.env.state_size) \
                .index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)

        # each agent learns from its own experience sample
        for i, agent in enumerate(self.ddpg_agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)
class MADDPG():
    def __init__(self, action_size=2, seed=42, n_agents=2):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of distinct agents
        """
        self.n_agents = n_agents
        self.timestep = 0
        self.agents = [DDPG(i) for i in range(n_agents)]
        # common buffer for both the agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)            # reshape 2x24 into 1x48 dim vector
        all_next_states = all_next_states.reshape(1, -1)  # reshape 2x24 into 1x48 dim vector
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.timestep += 1
        if self.timestep % 2 == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # sample from the replay buffer for each agent
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # calculate each agent's action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=0.5, add_noise=True)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        # each agent uses its own actor to calculate next_actions
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get its action via the local actor network
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get its action via the target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)

    def save_agents(self):
        # save models for each agent
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_{i}.pth")
def train(
        env_location,
        curve_path,
        n_episodes=1000,
        batch_size=512,
        buffer_size=int(1e6),
):
    env = UnityEnvironment(file_name=env_location)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info(
        'There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # Replay memory (both agents share the same buffer)
    random_seed = 2
    memory0 = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
    memory1 = memory0

    def create_agent(memory):
        return Agent(state_size=states.shape[1],
                     action_size=brain.vector_action_space_size,
                     random_seed=random_seed,
                     memory=memory,
                     batch_size=batch_size)

    agent0 = create_agent(memory0)
    agent1 = create_agent(memory1)

    def ddpg(n_episodes, average_window=100, plot_every=4):
        scores_deque = deque(maxlen=average_window)
        scores_all = []
        average_scores_all = []
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = np.array(env_info.vector_observations,
                              copy=True)  # get the current state (for each agent)
            agent0.reset()
            agent1.reset()
            scores = np.zeros(num_agents)  # initialize the score (for each agent)
            while True:
                action0 = agent0.act(states[0])
                action1 = agent1.act(states[1])
                actions = np.concatenate((action0, action1))
                env_info = env.step(actions)[brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished
                memory0.add(states[0], action0, rewards[0], next_states[0],
                            dones[0])
                memory1.add(states[1], action1, rewards[1], next_states[1],
                            dones[1])
                agent0.step()
                agent1.step()
                scores += env_info.rewards  # update the score (for each agent)
                states = next_states  # roll over states to next time step
                any_done = np.any(dones)
                assert any_done == np.all(dones)
                if any_done:  # exit loop if episode finished
                    break

            score_episode = np.max(scores)
            best_agent = np.argmax(scores)
            scores_deque.append(score_episode)
            scores_all.append(score_episode)
            average_score_queue = np.mean(scores_deque)
            average_scores_all.append(average_score_queue)

            logger.info(
                '\rEpisode {}\tScore: {:.4f}\tBest Agent: {}\tAverage Score: {:.4f}'
                .format(i_episode, score_episode, best_agent,
                        average_score_queue))

            torch.save(agent0.actor_local.state_dict(), 'checkpoint_actor0.pth')
            torch.save(agent0.critic_local.state_dict(), 'checkpoint_critic0.pth')
            torch.save(agent1.actor_local.state_dict(), 'checkpoint_actor1.pth')
            torch.save(agent1.critic_local.state_dict(), 'checkpoint_critic1.pth')

            if i_episode > average_window and average_score_queue > 1.0:
                break
            if i_episode % plot_every == 0:
                plot_curve(scores_all, average_scores_all)
        return scores_all, average_scores_all

    scores, average_scores = ddpg(n_episodes=n_episodes)
    plot_curve(scores, average_scores)
    env.close()
    return np.max(average_scores)
def train(env_location, curve_path, n_episodes=1000):
    env = UnityEnvironment(file_name=env_location)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # Replay memory
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 1024       # minibatch size
    random_seed = 2
    memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def create_agent():
        return Agent(state_size=states.shape[1],
                     action_size=brain.vector_action_space_size,
                     random_seed=random_seed,
                     memory=memory,
                     batch_size=BATCH_SIZE)

    agent = create_agent()

    def ddpg(n_episodes, average_window=100, plot_every=4):
        scores_deque = deque(maxlen=average_window)
        scores_all = []
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = np.array(env_info.vector_observations,
                              copy=True)  # get the current state (for each agent)
            agent.reset()
            scores = np.zeros(num_agents)  # initialize the score (for each agent)
            while True:
                actions = agent.act(states)
                actions = np.clip(actions, -1, 1)  # all actions between -1 and 1
                env_info = env.step(actions)[brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished

                # Add each agent's experience to the shared replay buffer
                for i in range(num_agents):
                    memory.add(states[i], actions[i], rewards[i],
                               next_states[i], dones[i])
                agent.step()

                scores += env_info.rewards  # update the score (for each agent)
                states = next_states  # roll over states to next time step
                any_done = np.any(dones)
                assert any_done == np.all(dones)
                if any_done:  # exit loop if episode finished
                    break

            average_score_episode = np.mean(scores)
            scores_deque.append(average_score_episode)
            scores_all.append(average_score_episode)
            average_score_queue = np.mean(scores_deque)
            logger.info('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(
                i_episode, average_score_episode, average_score_queue))

            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor2.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic2.pth')

            if i_episode > average_window and average_score_queue > 30:
                break
            if i_episode % plot_every == 0:
                plot_curve(scores_all)
        return scores_all

    scores = ddpg(n_episodes=n_episodes)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig('learning.curve.png')
    env.close()