# Shared imports for the training snippets below
import pprint
from collections import deque

import numpy as np
import pandas


def train_agent(actor_learning_rate, critic_learning_rate, fc_units, thau, batch_size):
    # Set tunable parameters
    params['actor_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['critic_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['actor_learning_rate'] = actor_learning_rate
    params['critic_learning_rate'] = critic_learning_rate
    params['thau'] = thau
    params['batch_size'] = int(batch_size)

    # Create agent instance
    print("Created agent with the following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=0)

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=0)

    update_interval = params['update_interval']
    replay_start = params['replay_initial']

    """ Training loop """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores
    filemeta = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}_solved{:d}"

    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        # Capture the current state
        state = env_info.vector_observations[0]
        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not done:
            # Action selection
            action = agent.act(state)
            # Take action and get reward and new state
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]  # True if next_state is terminal
            # Store experience
            memory.push(state, action, reward, next_state, done)
            # Update Q-Learning
            step += 1
            if (step % update_interval) == 0 and len(memory) > replay_start:
                # Recall experiences (mini-batch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)
            # State transition
            state = next_state
            # Update total score
            score += reward

        # Push to score list
        scores_window.append(score)
        scores.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
            i_episode, score, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
                i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= params['stop_scores']:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - params['scores_window_size'], np.mean(scores_window)))
            break

    """ End of the Training """
    print('\n')

    # Filename string
    filename = filemeta.format(params['env_name'], agent.name,
                               params['actor_learning_rate'],
                               params['critic_learning_rate'],
                               int(fc_units), params['thau'],
                               params['batch_size'],
                               i_episode - params['scores_window_size'])
    agent.export_network('./models/{:s}'.format(filename))

    # Export scores to csv file
    df = pandas.DataFrame(scores, columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/{:s}.csv'.format(filename), sep=',', index=False)

    # Log this run's hyperparameters and results
    hyperscores.append([
        params['actor_learning_rate'], params['critic_learning_rate'],
        int(fc_units), params['thau'], params['batch_size'],
        np.mean(scores_window), i_episode - params['scores_window_size']
    ])
    log_df = pandas.DataFrame(hyperscores,
                              columns=[
                                  'actor_learning_rate', 'critic_learning_rate',
                                  'fc_units', 'thau', 'batch_size',
                                  'scores', 'trained_episodes'
                              ])
    log_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return params['stop_scores'] - np.mean(scores_window)
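# train_agent() returns stop_scores minus the achieved average score, i.e. a
# loss where lower is better. A minimal driver sketch, assuming the bayes_opt
# package (which maximizes its objective, hence the sign flip); the search
# bounds below are illustrative assumptions, not tuned values.
from bayes_opt import BayesianOptimization

pbounds = {
    'actor_learning_rate': (1e-5, 1e-3),
    'critic_learning_rate': (1e-5, 1e-3),
    'fc_units': (64, 512),
    'thau': (1e-4, 1e-2),
    'batch_size': (32, 256),
}
optimizer = BayesianOptimization(
    f=lambda **kwargs: -train_agent(**kwargs),  # negate: maximize -loss
    pbounds=pbounds,
    random_state=0)
optimizer.maximize(init_points=5, n_iter=25)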
print('Number of actions : ', action_size)
print(' - low :', env.action_space.low)
print(' - high:', env.action_space.high)
print('Dimension of state space : ', state_size)
print(' - low :', env.observation_space.low)
print(' - high:', env.observation_space.high)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Initialize replay buffer
memory = ReplayBuffer(action_size,
                      params['replay_size'],
                      params['batch_size'],
                      seed=params['random_seed'])

print('Hyperparameter values:')
pprint.pprint(params)

""" Training loop """
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
scores = []  # list containing scores from each episode
scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores

for i_episode in range(1, params['train_episodes'] + 1):
    # Reset the environment
    state = env.reset()
    agent.reset()
    # Reset score collector
def train_agent(trial_id):
    # Create agent instance
    print("Created agent with the following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=params['random_seed'])

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=params['random_seed'])

    # Define parameters for exploration
    noise_amplitude = params['noise_amplitude_start']
    noise_amplitude_final = params['noise_amplitude_final']
    noise_amplitude_decay = params['noise_amplitude_decay']

    """ Training loop """
    max_step = 500
    max_score = -np.inf
    scores_history = []  # list containing scores from each episode
    scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores

    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        state = env.reset()
        agent.reset()
        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not np.any(done):
            # Get action from the agent
            action = agent.act(state, noise_amplitude=noise_amplitude)
            # Take action and get reward and new state
            next_state, reward, done, _ = env.step(action)
            # Store experience
            memory.push(state, action, reward, next_state, done)
            # Update the Critic and Actor
            step += 1
            if (step % params['update_interval']) == 0 and len(memory) > params['replay_initial']:
                # Recall experiences (mini-batch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)
            # State transition
            state = next_state
            # Update total score
            score += reward
            if max_step < step:
                break

        # Push to score list
        scores_window.append(score)
        scores_history.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
            i_episode, score, np.mean(scores_window), noise_amplitude), end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
                i_episode, score, np.mean(scores_window), noise_amplitude))
        if np.mean(scores_window) >= params['stop_scores']:
            max_score = np.mean(scores_window)
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - params['scores_window_size'], np.mean(scores_window)))
            break
        elif max_score < np.mean(scores_window):
            max_score = np.mean(scores_window)

        # Update exploration
        noise_amplitude = max(noise_amplitude_final,
                              noise_amplitude * noise_amplitude_decay)

    """ End of the Training """
    print('\n')

    # Filename string
    filename = "{:05d}".format(trial_id)

    # Export trained agent's parameters
    # agent.export_network('./models/{:s}'.format(filename))

    # Export scores to csv file
    df = pandas.DataFrame(scores_history,
                          columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/optuna_logs/{:s}.csv'.format(filename), sep=',', index=False)

    # Collect the trial's hyperparameters and results
    param_metas = list(params.keys())
    param_metas.extend(['scores', 'trained_episodes', 'filename'])
    param_values = list(params.values())
    param_values.extend([np.mean(scores_window), i_episode, filename])

    # Append to the optimization log and export it
    optuna_log.append(param_values)
    optuna_df = pandas.DataFrame(optuna_log, columns=param_metas)
    optuna_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return params['stop_scores'] - max_score
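# A minimal Optuna driver sketch, assuming the tunable entries of `params` are
# filled in from the trial before training; the parameter names and search
# ranges below are illustrative assumptions. train_agent() returns stop_scores
# minus the best average score, so the study minimizes.
import optuna


def objective(trial):
    params['actor_learning_rate'] = trial.suggest_float(
        'actor_learning_rate', 1e-5, 1e-3, log=True)
    params['critic_learning_rate'] = trial.suggest_float(
        'critic_learning_rate', 1e-5, 1e-3, log=True)
    params['batch_size'] = trial.suggest_categorical(
        'batch_size', [64, 128, 256])
    return train_agent(trial.number)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)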
action_size = brain.vector_action_space_size
state_size = env_info.vector_observations.shape[1]
print('Number of agents : ', number_of_agents)
print('Number of actions : ', action_size)
print('Dimension of state space : ', state_size)

# Initialize agents
agents = MultiAgent(number_of_agents=number_of_agents,
                    state_size=state_size,
                    action_size=action_size,
                    param=params,
                    seed=params['random_seed'])

# Initialize replay buffer
memory = ReplayBuffer(params['replay_size'],
                      params['batch_size'],
                      seed=params['random_seed'])

update_interval = params['update_interval']
replay_start = params['replay_initial']

# Define parameters for training
episodes = params['train_episodes']  # maximum number of training episodes
stop_scores = params['stop_scores']
scores_window_size = params['scores_window_size']

# Define parameters for exploration
noise_amplitude = params['noise_amplitude_start']
noise_amplitude_final = params['noise_amplitude_final']
noise_amplitude_decay = params['noise_amplitude_decay']

print('Hyperparameter values:')
number_of_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
state_size = env_info.vector_observations.shape[1]
print('Number of agents : ', number_of_agents)
print('Number of actions : ', action_size)
print('Dimension of state space : ', state_size)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Initialize replay buffer
memory = ReplayBuffer(action_size,
                      params['replay_size'],
                      params['batch_size'],
                      seed=params['random_seed'])

# Define parameters for exploration
noise_amplitude = 1.0        # params['noise_amplitude_start']
noise_amplitude_final = 0.1  # params['noise_amplitude_final']
noise_amplitude_decay = 0.999  # params['noise_amplitude_decay']

print('Hyperparameter values:')
pprint.pprint(params)

""" Training loop """
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
scores_history = []  # list containing scores from each episode
scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores

for i_episode in range(1, params['train_episodes'] + 1):
def train_agent():
    # Create agent instance
    print("Created agent with the following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agents
    agents = MultiAgent(number_of_agents=number_of_agents,
                        state_size=state_size,
                        action_size=action_size,
                        param=params,
                        seed=params['random_seed'])

    # Initialize replay buffer
    memory = ReplayBuffer(params['replay_size'],
                          params['batch_size'],
                          seed=params['random_seed'])

    update_interval = params['update_interval']
    replay_start = params['replay_initial']

    # Define parameters for training
    episodes = params['train_episodes']  # maximum number of training episodes
    stop_scores = params['stop_scores']
    scores_window_size = params['scores_window_size']

    # Define parameters for exploration
    noise_amplitude = params['noise_amplitude_start']
    noise_amplitude_final = params['noise_amplitude_final']
    noise_amplitude_decay = params['noise_amplitude_decay']

    """ Training loop """
    filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}_solved{:d}"
    scores_history = []  # list containing scores from each episode
    scores_window = deque(maxlen=scores_window_size)  # last (window_size) scores

    for i_episode in range(1, episodes + 1):
        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agents.reset()
        # Capture the current state
        states = env_info.vector_observations
        dones = env_info.local_done
        # Reset score collector
        scores = np.zeros(number_of_agents)
        # One episode loop
        step = 0
        while not np.any(dones):
            # Get actions from all agents
            actions = agents.act(states, noise_amplitude=noise_amplitude)
            # Take action and get rewards and new state
            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards  # update the score (for each agent)
            # Store experience
            memory.push(states, actions, rewards, next_states, dones)
            # Update the Critics and Actors of all the agents
            step += 1
            if (step % update_interval) == 0 and len(memory) > replay_start:
                for agent_id in range(number_of_agents):
                    # Recall experiences (mini-batch)
                    experiences = memory.recall()
                    # Train agent
                    agents.learn(experiences, agent_id)
            # State transition
            states = next_states

        # Push to score list
        scores_window.append(np.max(scores))
        scores_history.append([np.max(scores),
                               np.mean(scores_window),
                               np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
            i_episode, np.max(scores), np.mean(scores_window), noise_amplitude), end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
                i_episode, np.max(scores), np.mean(scores_window), noise_amplitude))
        if np.mean(scores_window) >= stop_scores:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - scores_window_size, np.mean(scores_window)))
            break

        # Update exploration
        noise_amplitude = max(noise_amplitude_final,
                              noise_amplitude * noise_amplitude_decay)

    """ End of the Training """
    print('\n')

    # Filename string
    filename = filename_format.format(params['env_name'], 'MADDPG',
                                      params['actor_learning_rate'],
                                      params['critic_learning_rate'],
                                      params['actor_hidden_layers'][0],
                                      params['actor_thau'],
                                      params['batch_size'],
                                      i_episode - scores_window_size)

    # Export trained agents' parameters
    agents.export_network('./models/{:s}'.format(filename))

    # Export scores to csv file
    df = pandas.DataFrame(scores_history,
                          columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/{:s}.csv'.format(filename), sep=',', index=False)

    # Log this run's hyperparameters and results
    hyperscores.append([value for param, value in params.items()] +
                       [np.mean(scores_window), i_episode])
    log_df = pandas.DataFrame(hyperscores,
                              columns=list(params.keys()) + ['scores', 'trained_episodes'])
    log_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return params['stop_scores'] - np.mean(scores_window)
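# The MADDPG variant of train_agent() relies on module-level state prepared
# earlier (env, brain_name, number_of_agents, state_size, action_size, params,
# hyperscores, log_filename). A minimal sketch of the bookkeeping globals it
# appends to; the names and values here are illustrative assumptions:
hyperscores = []                     # one row per completed training run
log_filename = 'maddpg_hyperscores'  # hypothetical log name (no extension)

# Each call trains a fresh MultiAgent and returns stop_scores minus the final
# windowed average, so smaller return values mean better runs.
result = train_agent()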