def __init__(self, number_of_agents, state_size, action_size, param, seed=0):
    super(MultiAgent, self).__init__()
    # Parameter settings
    param['actor_state_size'] = state_size
    param['actor_action_size'] = action_size
    # Critic input = all_states + all_actions
    param['critic_state_size'] = state_size * number_of_agents
    param['critic_action_size'] = action_size * number_of_agents
    # Create one Agent instance per agent index
    # (was hardcoded to two agents, ignoring number_of_agents)
    self.number_of_agents = number_of_agents
    self.agents = [Agent(i, param, seed) for i in range(number_of_agents)]
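# Hedged sketch of the Agent constructor assumed above (the real class lives
# elsewhere in this repo). It only illustrates how the actor/critic size keys
# written into `param` would be consumed; `build_mlp` is a hypothetical helper.
import torch.nn as nn

def build_mlp(in_size, hidden_layers, out_size):
    """Hypothetical helper: stack Linear+ReLU layers per the hidden-layer spec."""
    sizes = [in_size] + list(hidden_layers)
    layers = []
    for a, b in zip(sizes[:-1], sizes[1:]):
        layers += [nn.Linear(a, b), nn.ReLU()]
    layers.append(nn.Linear(sizes[-1], out_size))
    return nn.Sequential(*layers)

class Agent:
    def __init__(self, agent_id, param, seed):
        self.id = agent_id
        # Actor maps one agent's own state to its action
        self.actor = build_mlp(param['actor_state_size'],
                               param['actor_hidden_layers'],
                               param['actor_action_size'])
        # Critic scores the joint (all states + all actions) input
        self.critic = build_mlp(param['critic_state_size'] + param['critic_action_size'],
                                param['critic_hidden_layers'], 1)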
def train_agent(actor_learning_rate, critic_learning_rate, fc_units, thau, batch_size):
    # Set tunable parameters
    params['actor_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['critic_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['actor_learning_rate'] = actor_learning_rate
    params['critic_learning_rate'] = critic_learning_rate
    params['thau'] = thau
    params['batch_size'] = int(batch_size)

    # Create agent instance
    print("Created agent with following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=0)

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=0)

    update_interval = params['update_interval']
    replay_start = params['replay_initial']

    """ Training loop """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores
    filemeta = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}_solved{:d}"

    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()

        # Capture the current state
        state = env_info.vector_observations[0]

        # Reset score collector
        score = 0

        # One episode loop
        step = 0
        done = False
        while not done:
            # Action selection
            action = agent.act(state)

            # Take action and get reward and new state
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]  # True if next_state is terminal

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update Q-Learning
            step += 1
            if (step % update_interval) == 0 and len(memory) > replay_start:
                # Recall experiences (mini batch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

        # Push to score list
        scores_window.append(score)
        scores.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
            i_episode, score, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
                i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= params['stop_scores']:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - params['scores_window_size'], np.mean(scores_window)))
            break

    """ End of the Training """
    print('\n')

    # Filename string
    filename = filemeta.format(params['env_name'], agent.name,
                               params['actor_learning_rate'],
                               params['critic_learning_rate'],
                               int(fc_units), params['thau'],
                               params['batch_size'],
                               i_episode - params['scores_window_size'])
    agent.export_network('./models/{:s}'.format(filename))

    # Export scores to csv file
    df = pandas.DataFrame(scores, columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/{:s}.csv'.format(filename), sep=',', index=False)

    # Log this run's hyperparameters and result
    # (a 'scores' column was missing; the row has seven values)
    hyperscores.append([params['actor_learning_rate'],
                        params['critic_learning_rate'],
                        fc_units, params['thau'],
                        params['batch_size'],
                        np.mean(scores_window),
                        i_episode - params['scores_window_size']])
    log_df = pandas.DataFrame(hyperscores,
                              columns=['actor_learning_rate',
                                       'critic_learning_rate', 'fc_units',
                                       'thau', 'batch_size', 'scores',
                                       'i_episode'])
    log_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return (params['stop_scores'] - np.mean(scores_window))
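# Hedged sketch: train_agent() takes keyword hyperparameters and returns
# (stop_scores - average_score), i.e. a loss to minimize, which matches the
# bayesian-optimization package's interface once the sign is flipped. The
# bounds below are illustrative assumptions, not values from this repo.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=lambda **kwargs: -train_agent(**kwargs),  # maximize the negated loss
    pbounds={'actor_learning_rate': (1e-5, 1e-3),
             'critic_learning_rate': (1e-5, 1e-3),
             'fc_units': (64, 512),      # cast to int inside train_agent
             'thau': (1e-3, 1e-1),
             'batch_size': (32, 256)},   # cast to int inside train_agent
    random_state=0)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best hyperparameter set found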
env = gym.make(env_name)
env.seed(params['random_seed'])

# Get environment parameters
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]
print('Number of actions : ', action_size)
print(' - low:', env.action_space.low)
print(' - high:', env.action_space.high)
print('Dimension of state space : ', state_size)
print(' - low:', env.observation_space.low)
print(' - high:', env.observation_space.high)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Filename string
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
filename = filename_format.format(params['env_name'], agent.name,
                                  params['actor_learning_rate'],
                                  params['critic_learning_rate'],
                                  params['actor_hidden_layers'][0],
                                  params['thau'], params['batch_size'])

# Load the pre-trained network
agent.import_network('./models/{:s}'.format(filename))

# Define parameters for test
episodes = 10  # maximum number of test episodes
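# Hedged sketch of the test loop this setup prepares for: run `episodes`
# evaluation rollouts with the imported network and no exploration noise.
# The noise_amplitude=0.0 keyword mirrors the training code below and is an
# assumption about agent.act()'s signature.
for i_episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.act(state, noise_amplitude=0.0)  # deterministic policy
        state, reward, done, _ = env.step(action)
        score += reward
    print('#TEST Episode:{}, Score:{:.2f}'.format(i_episode, score))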
env = gym.make(env_name)
env.seed(params['random_seed'])

# Get environment parameters
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]
print('Number of actions : ', action_size)
print(' - low:', env.action_space.low)
print(' - high:', env.action_space.high)
print('Dimension of state space : ', state_size)
print(' - low:', env.observation_space.low)
print(' - high:', env.observation_space.high)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Initialize replay buffer
memory = ReplayBuffer(action_size,
                      params['replay_size'],
                      params['batch_size'],
                      seed=params['random_seed'])

print('Hyperparameter values:')
pprint.pprint(params)

""" Training loop """
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
scores = []  # list containing scores from each episode
scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores
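# Hedged sketch of the `params` dictionary these blocks assume. The keys are
# collected from their uses throughout this file; the values are illustrative
# defaults only, not the repo's actual settings.
params = {
    'env_name': env_name,
    'random_seed': 0,
    'replay_size': int(1e6),        # replay buffer capacity
    'replay_initial': 1024,         # experiences required before learning starts
    'batch_size': 128,
    'update_interval': 1,           # learn every N environment steps
    'train_episodes': 2000,
    'scores_window_size': 100,      # moving-average window
    'stop_scores': 30.0,            # environment-solved threshold
    'actor_learning_rate': 1e-4,
    'critic_learning_rate': 1e-3,
    'actor_hidden_layers': [256, 128],
    'critic_hidden_layers': [256, 128],
    'thau': 1e-3,                   # soft-update coefficient (sic: tau)
    'noise_amplitude_start': 1.0,   # exploration noise schedule
    'noise_amplitude_final': 0.01,
    'noise_amplitude_decay': 0.995,
}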
def train_agent(trial_id):
    # Create agent instance
    print("Created agent with following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=params['random_seed'])

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=params['random_seed'])

    # Define parameters for exploration
    noise_amplitude = params['noise_amplitude_start']
    noise_amplitude_final = params['noise_amplitude_final']
    noise_amplitude_decay = params['noise_amplitude_decay']

    """ Training loop """
    max_step = 500  # cap on steps per episode
    max_score = -np.inf
    filename_format = "{:05d}"
    scores_history = []  # list containing scores from each episode
    scores_window = deque(maxlen=params['scores_window_size'])  # last (window_size) scores

    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        state = env.reset()
        agent.reset()

        # Reset score collector
        score = 0

        # One episode loop
        step = 0
        done = False
        while not np.any(done):
            # Action selection with exploration noise
            action = agent.act(state, noise_amplitude=noise_amplitude)

            # Take action and get reward and new state
            next_state, reward, done, _ = env.step(action)

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update the critic and actor
            step += 1
            if (step % params['update_interval']) == 0 and len(memory) > params['replay_initial']:
                # Recall experiences (mini batch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

            if step > max_step:
                break

        # Push to score list
        scores_window.append(score)
        scores_history.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
            i_episode, score, np.mean(scores_window), noise_amplitude), end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'.format(
                i_episode, score, np.mean(scores_window), noise_amplitude))
        if np.mean(scores_window) >= params['stop_scores']:
            max_score = np.mean(scores_window)
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - params['scores_window_size'], np.mean(scores_window)))
            break
        elif max_score < np.mean(scores_window):
            max_score = np.mean(scores_window)

        # Update exploration
        noise_amplitude = max(noise_amplitude_final,
                              noise_amplitude * noise_amplitude_decay)

    """ End of the Training """
    print('\n')

    # Filename string
    filename = filename_format.format(trial_id)

    # Export trained agent's parameters (disabled during hyperparameter search)
    # agent.export_network('./models/{:s}'.format(filename))

    # Export scores to csv file
    df = pandas.DataFrame(scores_history,
                          columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/optuna_logs/{:s}.csv'.format(filename),
              sep=',', index=False)

    # Collect hyperparameters and results for this trial
    param_metas = [key for key in params.keys()]
    param_metas.extend(['scores', 'trained_episodes', 'filename'])
    param_values = [value for value in params.values()]
    param_values.extend([np.mean(scores_window), i_episode, filename])

    # Append to the global optimization log and export it
    optuna_log.append(param_values)
    optuna_df = pandas.DataFrame(optuna_log, columns=param_metas)
    optuna_df.to_csv('scores/{:s}.csv'.format(log_filename))

    # Return the optimization objective (loss to minimize)
    return (params['stop_scores'] - max_score)
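# Hedged sketch of how train_agent() could be driven by Optuna, which the
# optuna_log / optuna_logs names above suggest. The sampled ranges and the
# `objective` wrapper are illustrative assumptions, not code from this repo.
import optuna

def objective(trial):
    # Sample hyperparameters into the global params dict before training
    params['actor_learning_rate'] = trial.suggest_float(
        'actor_learning_rate', 1e-5, 1e-3, log=True)
    params['critic_learning_rate'] = trial.suggest_float(
        'critic_learning_rate', 1e-5, 1e-3, log=True)
    params['batch_size'] = trial.suggest_categorical('batch_size', [64, 128, 256])
    # train_agent() returns stop_scores - max_score, i.e. a loss to minimize
    return train_agent(trial.number)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params)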