def learn(self, gamma):
    """Learn from experiences"""
    actor_losses = []
    critic_losses = []
    self.learn_step += 1
    for i in range(self.num_agents):
        experiences = self.memory.sample()
        actor_loss, critic_loss = self._learn(
            experiences, gamma, self.actor_local[i], self.actor_target[i],
            self.critic_local[i], self.critic_target[i],
            self.actor_optimizer[i], self.critic_optimizer[i])
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

    if self.learn_step % self.print_every == 0:
        self.writer.text('critic loss: {}'.format(np.mean(critic_losses)),
                         "Critic Multi Agent")
        save_to_txt(np.mean(critic_losses),
                    '{}/critic_losses_multi.txt'.format(self.dirname))
        self.writer.push(np.mean(critic_losses), "Loss(critic)")

        self.writer.text('actor loss: {}'.format(np.mean(actor_losses)),
                         "Actor Multi Agent")
        save_to_txt(np.mean(actor_losses),
                    '{}/actor_losses_multi.txt'.format(self.dirname))
        self.writer.push(np.mean(actor_losses), "Loss(actor)")
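# NOTE: the body of self._learn is not shown above. The sketch below is a
# minimal, hypothetical single-agent DDPG update written with standard
# PyTorch calls. The helper names (ddpg_learn, soft_update), the tau value,
# and the actor/critic call signatures are assumptions for illustration,
# not the author's confirmed implementation.
import torch
import torch.nn.functional as F


def soft_update(local_model, target_model, tau=1e-3):
    """Slowly blend local weights into the target network."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)


def ddpg_learn(experiences, gamma, actor_local, actor_target, critic_local,
               critic_target, actor_optimizer, critic_optimizer, tau=1e-3):
    """One DDPG update step on a sampled batch of experiences (a sketch)."""
    states, actions, rewards, next_states, dones = experiences

    # Critic update: minimise the TD error against the target networks
    next_actions = actor_target(next_states)
    q_targets_next = critic_target(next_states, next_actions)
    q_targets = rewards + gamma * q_targets_next * (1 - dones)
    q_expected = critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets.detach())
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update: maximise the critic's value of the actor's actions
    actor_loss = -critic_local(states, actor_local(states)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Move the target networks towards the local networks
    soft_update(actor_local, actor_target, tau)
    soft_update(critic_local, critic_target, tau)

    return actor_loss.item(), critic_loss.item()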
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--num_episodes", type=int, default=1000,
                        help="Total number of episodes to train")
    parser.add_argument("--max_t", type=int, default=1000,
                        help="Max timestep in a single episode")
    parser.add_argument("--vis", type=bool, default=True,
                        help="Whether to use visdom to visualise training")
    parser.add_argument("--model", type=str, default=None,
                        help="Model checkpoint path, use if you wish to continue training from a checkpoint")
    parser.add_argument("--info", type=str, default="",
                        help="Use this to attach notes to your runs")
    parser.add_argument("--stop_on_solve", type=bool, default=True,
                        help="Stop as soon as the environment is solved")
    args = parser.parse_args()

    # visualiser
    writer = VisWriter(vis=args.vis)

    # save info/comments about the experiment
    save_to_txt(args.info, '{}/info.txt'.format(dirname))

    # Unity Env
    env = UnityEnvironment(file_name='env/Tennis_Linux_NoVis/Tennis.x86_64')

    # brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    state = env_info.vector_observations
    state_shape = state.shape[1]
    action_size = brain.vector_action_space_size

    agent = DDPGMultiAgent(state_shape, action_size, num_agents,
                           writer=writer, random_seed=10, dirname=dirname,
                           print_every=100, model_path=args.model)

    scores = ddpg(env, brain_name, num_agents, agent, writer,
                  n_episodes=args.num_episodes, max_t=args.max_t,
                  stop_on_solve=args.stop_on_solve)

    # save all scores (one score per line; scores are numpy floats, so
    # convert to str before joining)
    save_to_txt('\n'.join([str(score) for score in scores]),
                '{}/scores_multi_full.txt'.format(dirname))
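# Caveat on the flags above: argparse treats `type=bool` literally, so any
# non-empty string (including "False") parses as True, which makes
# `--vis False` and `--stop_on_solve False` silently ineffective. A common
# workaround is an explicit string-to-bool converter; the helper name
# `str2bool` below is not part of the original code, it is a sketch of the
# usual fix.
import argparse


def str2bool(value):
    """Convert common true/false strings into a real boolean for argparse."""
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'Boolean value expected, got {!r}'.format(value))


# Usage: replace `type=bool` with `type=str2bool` in the parser above, e.g.
# parser.add_argument("--vis", type=str2bool, default=True,
#                     help="Whether to use visdom to visualise training")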
def step(self, states, actions, rewards, next_states, dones):
    """Performs the learning step.

    Save experience in replay memory, and sample uniformly at random
    from the buffer to learn.
    """
    self.learn_step += 1
    # Store a single entry for each step, i.e. the experience of
    # every agent for a step gets stored as a single entry.
    states = np.expand_dims(states, 0)
    actions = np.expand_dims(
        np.array(actions).reshape(self.num_agents, self.action_size), 0)
    rewards = np.expand_dims(
        np.array(rewards).reshape(self.num_agents, -1), 0)
    dones = np.expand_dims(np.array(dones).reshape(self.num_agents, -1), 0)
    next_states = np.expand_dims(
        np.array(next_states).reshape(self.num_agents, -1), 0)
    # Use debugger to explore the shape
    # import pdb; pdb.set_trace()
    self.memory.add(states, actions, rewards, next_states, dones)

    # Only learn if we have enough data/experiences in memory, and only
    # every LEARN_STEP calls
    if len(self.memory) < self.config['BATCH_SIZE']:
        return
    if self.learn_step % self.config['LEARN_STEP'] != 0:
        return

    experiences = self.memory.sample()
    actor_losses = []
    critic_losses = []
    for agent in self.agents:
        actor_loss, critic_loss = agent.learn(self.agents, experiences,
                                              self.config['GAMMA'])
        actor_losses.append(actor_loss)
        critic_losses.append(critic_loss)

    # Plot real-time graphs and store losses
    if self.learn_step % self.print_every == 0:
        # Save Critic loss
        save_to_txt(critic_losses,
                    '{}/critic_losses.txt'.format(self.dirname))
        self.writer.text('critic loss: {}'.format(critic_losses), "Critic")
        self.writer.push(critic_losses, "Loss(critic)")
        # Save Actor loss
        save_to_txt(actor_losses,
                    '{}/actor_losses.txt'.format(self.dirname))
        self.writer.text('actor loss: {}'.format(actor_losses), "Actor")
        self.writer.push(actor_losses, "Loss(actor)")
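# self.memory above is only used through add(), sample() and len(). A
# minimal uniform-sampling replay buffer compatible with those calls might
# look like the sketch below; the class name, the namedtuple layout and the
# numpy batching are assumptions, not the author's actual buffer.
import random
from collections import deque, namedtuple

import numpy as np

Experience = namedtuple(
    'Experience', ['states', 'actions', 'rewards', 'next_states', 'dones'])


class ReplayBuffer:
    """Fixed-size buffer storing one joint (all-agent) experience per entry."""

    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, states, actions, rewards, next_states, dones):
        """Append one joint experience (leading batch axis of size 1)."""
        self.memory.append(
            Experience(states, actions, rewards, next_states, dones))

    def sample(self):
        """Sample a batch of joint experiences uniformly at random."""
        batch = random.sample(self.memory, k=self.batch_size)
        states = np.vstack([e.states for e in batch])
        actions = np.vstack([e.actions for e in batch])
        rewards = np.vstack([e.rewards for e in batch])
        next_states = np.vstack([e.next_states for e in batch])
        dones = np.vstack([e.dones for e in batch]).astype(np.uint8)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)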
def step(self, states, actions, rewards, next_states, dones):
    """Performs the learning step."""
    # store a single entry for results from all agents by adding axis=0
    states, actions, rewards, next_states, dones = self.reshape(
        states, actions, rewards, next_states, dones)
    self.memory.add(states, actions, rewards, next_states, dones)

    # Get agent to learn from experience if we have enough data/experiences in memory
    if (len(self.memory) > self.batch_size
            and self.learn_step % self.update_every == 0):
        experiences = self.memory.sample()
        actor_losses = []
        critic_losses = []
        for agent in self.agents:
            actor_loss, critic_loss = agent.learn(self.agents, experiences,
                                                  self.gamma)
            actor_losses.append(actor_loss)
            critic_losses.append(critic_loss)

        # Plot real-time graphs and store losses
        if self.learn_step % self.print_every == 0:
            # Save Critic loss
            utils.save_to_txt(
                critic_losses,
                '{}/critic_losses.txt'.format(self.result_dir))
            self.writer.text('critic loss: {}'.format(critic_losses),
                             'Critic')
            self.writer.push(critic_losses, 'Loss(critic)')
            # Save Actor loss
            utils.save_to_txt(
                actor_losses,
                '{}/actor_losses.txt'.format(self.result_dir))
            self.writer.text('actor loss: {}'.format(actor_losses), 'Actor')
            self.writer.push(actor_losses, 'Loss(actor)')

        self.critic_loss = np.array(critic_losses).mean()
        self.actor_loss = np.array(actor_losses).mean()

    self.learn_step += 1
    return self.critic_loss, self.actor_loss
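# self.reshape above is not shown. Judging from the expand_dims calls in the
# earlier step() variant, it packs the per-agent arrays into a single
# (1, num_agents, ...) entry. The free-function form and the name
# reshape_experience below are assumptions for illustration.
import numpy as np


def reshape_experience(num_agents, action_size, states, actions, rewards,
                       next_states, dones):
    """Pack per-agent arrays into one joint entry with a leading batch axis."""
    states = np.expand_dims(states, 0)
    actions = np.expand_dims(
        np.array(actions).reshape(num_agents, action_size), 0)
    rewards = np.expand_dims(np.array(rewards).reshape(num_agents, -1), 0)
    next_states = np.expand_dims(
        np.array(next_states).reshape(num_agents, -1), 0)
    dones = np.expand_dims(np.array(dones).reshape(num_agents, -1), 0)
    return states, actions, rewards, next_states, dones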
def maddpg(env, brain_name, num_agents, agent, writer, n_episodes=300,
           max_t=1000, print_every=50, stop_on_solve=True):
    """Train DDPG Agent

    Params
    ======
        env (object): Unity environment instance
        brain_name (string): name of brain
        num_agents (int): number of agents
        agent (DDPGMultiAgent): agent instance
        writer (VisWriter): Visdom visualiser for realtime plots
        n_episodes (int): number of episodes to train the network
        max_t (int): number of timesteps in each episode
        print_every (int): how often to print the progress
        stop_on_solve (bool): whether to stop training as soon as environment is solved
    """
    best_score = -np.inf
    scores_deque = deque(maxlen=100)
    maxt_deque = deque(maxlen=20)
    best_maxt = 0
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            score += env_info.rewards  # update the score (for each agent)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states  # roll over states to next time step
            if np.any(dones):
                logger.debug('Episode {} done at t = {}'.format(i_episode, t))
                maxt_deque.append(t)
                if t >= best_maxt:
                    best_maxt = t
                break  # exit loop if episode finished

        scores_deque.append(np.max(score))
        scores.append(np.max(score))
        current_score = np.mean(scores_deque)

        # keep storing the current score (in case we terminate early, we'll
        # still have data for plotting/comparison)
        save_to_txt(current_score, '{}/scores_multi.txt'.format(dirname))

        # Publish and save
        writer.text(
            'Episode {}/{}: Average score(100): {}'.format(
                i_episode, n_episodes, current_score),
            "Average 100 episodes")
        writer.push(np.mean(scores_deque), "Average Score")
        logger.info(
            'Episode {}\tAverage Score: {:.2f}, Average max_t: {:.2f}, Best max_t: {}'
            .format(i_episode, current_score, np.mean(maxt_deque), best_maxt))
        if len(scores) > 0:
            writer.push(scores[-1], "Score")

        if current_score > best_score:
            logger.info('Best score found, old: {}, new: {}'.format(
                best_score, current_score))
            best_score = current_score
            agent.checkpoint()

        if i_episode % print_every == 0:
            logger.info('Episode {}\tAverage Score: {:.2f}'.format(
                i_episode, current_score))

        # check environment solved
        if current_score >= 0.5:
            logger.info('Environment solved in {} episodes'.format(i_episode))
            if stop_on_solve:
                logger.info('Terminating agent training')
                break

    logger.info('Final Average Score: {:.2f}'.format(current_score))
    return scores
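# agent.reset() and agent.act() above imply an exploration-noise process
# that is re-initialised at the start of every episode. DDPG-style agents
# commonly use an Ornstein-Uhlenbeck process for this; the class below is a
# hedged sketch of such a process, not the author's confirmed noise model,
# and the mu/theta/sigma defaults are assumptions.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (called at episode start)."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state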
def train(
    agent_group: mag.MADDPGAgentGroup,
    env: UnityEnvironment,
    brain_name: str,
    num_agents: int,
    writer: utils.VisWriter,
    result_dir: str,
    logger,
    num_episodes: int = 10000,
    max_t: int = 5000,
    print_every: int = 100,
    passing_score: float = 0.5,
):
    scores_deque = deque(maxlen=print_every)
    max_t_deque = deque(maxlen=print_every)
    i_episode = 0
    scores = []
    current_t = 0
    best_max_t = 0
    for i_episode in range(1, num_episodes + 1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(num_agents)
        agent_group.reset()
        critic_loss, actor_loss = 0, 0
        t = 0
        for t in range(max_t):
            # agent acts
            actions = agent_group.act(states)
            if current_t % print_every == 0:
                for i in range(actions[0].shape[0]):
                    action_from_dim = [a[i] for a in actions]
                    writer.push(action_from_dim, f'Actions(dim-{i})')

            # receives feedback from env
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            score += rewards

            # agent explores or learns
            critic_loss, actor_loss = agent_group.step(
                states, actions, rewards, next_states, dones)
            states = next_states
            if np.any(dones):
                logger.debug('Episode {} done at t = {}'.format(i_episode, t))
                if t >= best_max_t:
                    best_max_t = t
                max_t_deque.append(best_max_t)
                break
            current_t += 1

        max_score = np.max(score)
        scores_deque.append(max_score)
        scores.append(max_score)
        current_score = np.mean(scores_deque)

        # keep track of scores
        utils.save_to_txt(current_score, '{}/scores.txt'.format(result_dir))

        logger.info(
            f'Episode {i_episode}, score : {max_score:.3f}. '
            f'Average score: {current_score:.3f}. '
            f'(critic_loss: {critic_loss:.7f}, actor_loss:{actor_loss:.7f})')

        # Publish and save
        writer.text(
            'Episode {}/{}: Average score(100): {}'.format(
                i_episode, num_episodes, current_score),
            'Average 100 episodes')
        writer.push(current_score, 'Average Score')
        logger.info(
            'Episode {}\tAverage Score: {:.2f}, Average max_t: {:.2f}, Best max_t: {}'
            .format(i_episode, current_score, np.mean(max_t_deque),
                    best_max_t))
        if len(scores) > 0:
            writer.push(scores[-1], 'Score')

        if current_score >= passing_score:
            logger.info(
                f'\nEnvironment solved in {i_episode-100:d} episodes!\t'
                f'Average Score: {np.mean(scores_deque):.4f}, '
                f'passing score: {passing_score}. Saving models.')
            break

    # save models
    agent_group.save()

    return scores
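# The scores returned by train() (and saved to scores.txt) can be inspected
# offline. The helper below is a small, hypothetical post-processing sketch;
# matplotlib is assumed to be available and is not imported anywhere in the
# code above. It plots each episode's score next to its rolling mean and
# marks the 0.5 passing score used by train().
import numpy as np
import matplotlib.pyplot as plt


def plot_scores(scores, window=100, out_path='scores.png'):
    """Plot raw episode scores and their rolling mean, then save to disk."""
    scores = np.asarray(scores, dtype=float)
    rolling = np.array([
        scores[max(0, i - window + 1):i + 1].mean()
        for i in range(len(scores))
    ])
    plt.figure(figsize=(10, 5))
    plt.plot(scores, alpha=0.4, label='Episode score')
    plt.plot(rolling, label='Rolling mean ({} episodes)'.format(window))
    plt.axhline(0.5, color='grey', linestyle='--', label='Passing score (0.5)')
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.legend()
    plt.savefig(out_path)


# Usage (after training): plot_scores(scores) writes scores.png alongside
# the text logs produced above.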