# Shared imports assumed by the training loops below. MADDPG is the
# project-specific agent class defined elsewhere in each repository;
# each function below was written against a different MADDPG variant.
from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment

PRINT_EVERY = 100  # assumed value; the original constant is defined elsewhere


def train_agents(n_episodes=10000, t_max=1000):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    seeding(seed=42)  # seeding() is a project helper; see the sketch below

    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size, action_size=action_size,
                    num_agents=num_agents)

    scores_deque = deque(maxlen=100)
    scores_list = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        for _ in range(t_max):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            maddpg.step(states, actions, rewards, next_states, dones)
            states = next_states
            if np.any(dones):
                break

        # episode score is the maximum over both agents
        scores_deque.append(np.max(scores))
        scores_list.append(np.max(scores))

        print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}', end="")
        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}')

        # save checkpoints once the 100-episode average reaches the target
        if np.mean(scores_deque) >= 2.0 and len(scores_deque) >= 100:
            for i, agent in enumerate(maddpg.agents):
                torch.save(agent.actor_local.state_dict(),
                           f'models/checkpoint_actor_local_{i}.pth')
                torch.save(agent.critic_local.state_dict(),
                           f'models/checkpoint_critic_local_{i}.pth')
            print(f'\nSaved Model: Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}')
            break

    return scores_list
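# A minimal sketch of the seeding() helper called above. The helper is defined
# elsewhere in the original repository; the body below is an assumption about
# what it does (seed the Python, NumPy, and PyTorch RNGs), not the author's
# actual code.
import random


def seeding(seed=42):
    random.seed(seed)        # Python's built-in RNG
    np.random.seed(seed)     # NumPy RNG used for noise/sampling
    torch.manual_seed(seed)  # PyTorch RNG for weight init and sampling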
def trainFunction(state_size, action_size, n_episodes=4000, num_agents=2):
    # env and brain_name are assumed to be module-level globals here, set up
    # when the Unity environment was first loaded; state_size is accepted but
    # unused by this particular MADDPG constructor.
    magent = MADDPG(action_size=action_size, noise_start=1.0, seed=2,
                    gamma=0.99, t_stop_noise=30000)
    scores = []
    scores_deque = deque(maxlen=100)
    scores_avg = []

    for i_episode in range(1, n_episodes + 1):
        rewards = []
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        if i_episode % 2:
            update = True  # request an agent update on every other episode

        # loop over steps
        while True:
            # select an action
            joint_actions = magent.act(states, update)
            update = False
            # take the action in the environment and read back the transition
            env_info = env.step(joint_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards_v = env_info.rewards
            done_v = env_info.local_done
            # update and train the agents with the returned information
            magent.step(states, joint_actions, rewards_v, next_states, done_v)
            states = next_states
            rewards.append(rewards_v)
            if any(done_v):
                break

        # episode reward is the maximum of the agents' individually summed rewards
        episode_reward = np.max(np.sum(np.array(rewards), axis=0))
        scores.append(episode_reward)         # overall score array
        scores_deque.append(episode_reward)   # running window of the last 100 scores
        current_avg_score = np.mean(scores_deque)
        scores_avg.append(current_avg_score)  # average of the last 100 scores

        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score), end="")
        # log the average score every 200 episodes
        if i_episode % 200 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score))

        # break and report success if the environment is solved
        # (checked every 200 episodes, matching the logging cadence)
        if np.mean(scores_deque) >= 0.5 and i_episode % 200 == 0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(
                i_episode, np.mean(scores_deque)))
            magent.save()
            break

    return scores, scores_avg
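# trainFunction() reads env and brain_name from module scope. A minimal setup
# sketch, assuming the Udacity Tennis build (the file path is an assumption);
# this wiring is not part of the original snippet.
env = UnityEnvironment(file_name="Tennis.app")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

scores, scores_avg = trainFunction(state_size, action_size)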
# Assumed values for the learning-schedule constants used below; the originals
# are defined elsewhere in the repository.
LEARN_EVERY = 1    # learn every N timesteps
LEARN_BATCH = 1    # number of learning passes per learning step
SAVE_EVERY = 500   # checkpoint weights every N episodes


def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """Train the MADDPG agents and monitor their performance.

    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples
      before learning begins

    Returns
    =======
    - scores: list with the max score (over agents) for each episode
    """
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing the max score from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    # the critic sees the full joint observation and joint action
    maddpg = MADDPG(state_size, action_size,
                    num_agents * state_size, num_agents * action_size)

    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()

        # get the current state (for each agent)
        states = env_info.vector_observations

        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))

            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # get reward (for each agent)
            rewards = env_info.rewards
            # see if the episode finished
            dones = env_info.local_done

            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)

            # roll over states to the next time step
            states = next_states

            # learn when the time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()

            # update the score (for each agent)
            scores += rewards

            # exit the loop if the episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)

        if i_episode > warmup_episodes:
            # save the final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode, warmup_episodes), end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if the task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print('\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.format(
                i_episode, mean_score))
            maddpg.save_weights()
            break

        if i_episode == num_episodes:
            print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))

    return episode_scores
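# Hypothetical usage of train(): reuse the env handle created in the setup
# sketch above; the warmup length and the plotting code are illustrative
# assumptions, not part of the original.
import matplotlib.pyplot as plt

episode_scores = train(env, num_episodes=5000, max_t=1000, warmup_episodes=300)

plt.plot(episode_scores)
plt.xlabel("Episode")
plt.ylabel("Max score over both agents")
plt.show()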
def train_maddpg(env, max_episode=1000, max_t=1000, print_every=5,
                 check_history=100, sigma_start=0.2, sigma_end=0.01,
                 sigma_decay=0.995):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size:', action_size)

    # initialize agent
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    maddpg = MADDPG(state_size, action_size, random_seed=123)

    scores_deque = deque(maxlen=check_history)
    scores = []

    # learn over multiple episodes
    sigma = sigma_start
    for episode in range(max_episode):
        # prepare for training in the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        maddpg.reset(sigma=sigma)

        # play and learn in the current episode
        for t in range(max_t):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if the episode finished
            maddpg.step(t, states, actions, rewards, next_states, dones)
            states = next_states
            reward = np.max(rewards)  # per-step score: the max over the two agents
            score += reward
            if np.any(dones):
                break

        # decay sigma to reduce exploration noise over time
        sigma = max(sigma_end, sigma * sigma_decay)

        # record the episode score
        episode_score = score
        scores_deque.append(episode_score)
        scores.append(episode_score)
        if episode % print_every == 0:
            print('Episode {}\tscore: {:.4f}\tAverage Score: {:.4f}'.format(
                episode, episode_score, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 0.5 and episode >= check_history:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'.format(
                episode - check_history, np.mean(scores_deque)))
            for agent in maddpg.ddpg_agents:
                torch.save(agent.actor_local.state_dict(),
                           'actor_agent_' + str(agent.id) + '.pth')
                torch.save(agent.critic_local.state_dict(),
                           'critic_agent_' + str(agent.id) + '.pth')
            break

    return scores
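# A minimal sketch of the MADDPG agent interface the four loops above call
# into. Each loop was written against a different project-specific MADDPG
# class, so the constructor arguments and method bodies below are illustrative
# assumptions about the shared contract (act / step / reset), not any one
# author's implementation.
class MADDPGSketch:
    """Skeleton of the multi-agent wrapper the training loops assume."""

    def __init__(self, state_size, action_size, num_agents=2):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # A real implementation holds one DDPG actor/critic pair per agent
        # plus a replay buffer shared across agents here.

    def act(self, states, noise=0.0):
        # Map each agent's observation to an action in [-1, 1];
        # stubbed with uniform random actions for illustration.
        return np.random.uniform(-1, 1, (self.num_agents, self.action_size))

    def step(self, states, actions, rewards, next_states, dones):
        # Store the joint transition in the replay buffer; a real
        # implementation samples a batch and runs the centralized-critic,
        # decentralized-actor MADDPG updates here.
        pass

    def reset(self):
        # Re-initialize the exploration-noise processes between episodes.
        pass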