def train_agent(actor_learning_rate, critic_learning_rate, fc_units, thau,
                batch_size):
    # Set tunable parameters
    params['actor_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['critic_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['actor_learning_rate'] = actor_learning_rate
    params['critic_learning_rate'] = critic_learning_rate
    params['thau'] = thau  # soft-update coefficient (tau)
    params['batch_size'] = int(batch_size)

    # Print hyperparameter values
    print("Created agent with the following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=0)

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=0)
    update_interval = params['update_interval']
    replay_start = params['replay_initial']
    """ Training loop  """
    scores = []  # list containing scores from each episode
    scores_window = deque(
        maxlen=params['scores_window_size'])  # last (window_size) scores
    filemeta = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}_solved{:d}"
    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()

        # Capture the current state
        state = env_info.vector_observations[0]

        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not done:
            # Action selection
            action = agent.act(state)

            # Take action and get rewards and new state
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]  # if next is terminal state

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update Q-Learning
            step += 1
            if (step % update_interval) == 0 and len(memory) > replay_start:
                # Recall experiences (miniBatch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

        # Push to score list
        scores_window.append(score)
        scores.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
            i_episode, score, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.
                  format(i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= params['stop_scores']:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - params['scores_window_size'],
                        np.mean(scores_window)))
            break
    """ End of the Training """
    print('\n')

    # Filename string
    filename = filemeta.format(params['env_name'], agent.name,
                               params['actor_learning_rate'],
                               params['critic_learning_rate'],
                               int(fc_units), params['thau'],
                               params['batch_size'],
                               i_episode - params['scores_window_size'])
    agent.export_network('./models/{:s}'.format(filename))
    # Export scores to csv file
    df = pandas.DataFrame(scores, columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/{:s}.csv'.format(filename), sep=',', index=False)

    hyperscores.append([
        params['actor_learning_rate'], params['critic_learning_rate'],
        fc_units, params['thau'], params['batch_size'],
        np.mean(scores_window), i_episode - params['scores_window_size']
    ])
    log_df = pandas.DataFrame(hyperscores,
                              columns=[
                                  'actor_learning_rate',
                                  'critic_learning_rate', 'fc_units', 'thau',
                                  'batch_size', 'average_score', 'i_episode'
                              ])
    log_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return (params['stop_scores'] - np.mean(scores_window))
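
# A minimal usage sketch: train_agent() returns (stop_scores - mean score), so
# lower is better and the function can be fed straight to a minimization
# routine. The sketch assumes scikit-optimize (skopt) drives the search; the
# search-space bounds below are illustrative only.
from skopt import gp_minimize
from skopt.space import Integer, Real

search_space = [
    Real(1e-5, 1e-3, prior='log-uniform', name='actor_learning_rate'),
    Real(1e-5, 1e-3, prior='log-uniform', name='critic_learning_rate'),
    Integer(64, 512, name='fc_units'),
    Real(1e-4, 1e-1, prior='log-uniform', name='thau'),
    Integer(32, 256, name='batch_size'),
]

# Each candidate x is a list ordered like train_agent()'s signature.
result = gp_minimize(lambda x: train_agent(*x), search_space, n_calls=20)
print('Best hyperparameters:', result.x, 'Best objective:', result.fun)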
Example #2
agent.import_network('./models/{:s}'.format(filename))

# Define parameters for test
episodes = 10  # maximum number of test episodes
""" Test loop  """
for i_episode in range(1, episodes + 1):
    # Reset the environment
    state = env.reset()

    # Reset score collector
    score = 0
    done = False
    # One episode loop
    while not done:
        # Action selection
        action = agent.act(state)
        env.render()

        # Take action and get rewards and new state
        next_state, reward, done, _ = env.step(action)

        # State transition
        state = next_state

        # Update total score
        score += reward

    # Print episode summary
    print('\r#TEST Episode:{}, Score:{:.2f}'.format(i_episode, score))
""" End of the Test """
Example #3
agent.import_network('./models/{:s}'.format(filename))

# Define parameters for test
episodes = 10  # maximum number of test episodes
""" Test loop  """
for i_episode in range(1, episodes + 1):
    # Reset the environment
    state = env.reset()

    # Reset score collector
    score = 0
    done = False
    # One episode loop
    while not done:
        # Action selection
        action = agent.act(state, add_noise=False)
        env.render()

        # Take action and get rewards and new state
        # Scale the policy output (assumed to lie in [-1, 1]) to the env's action range
        next_state, reward, done, _ = env.step(2 * action)

        # State transition
        state = next_state

        # Update total score
        score += reward

    # Print episode summary
    print('\r#TEST Episode:{}, Score:{:.2f}'.format(i_episode, score))
""" End of the Test """
Example #4
def train_agent(trial_id):
    # Print hyperparameter values
    print("Created agent with the following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=params['random_seed'])

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=params['random_seed'])

    # Define parameters for exploration
    noise_amplitude = params['noise_amplitude_start']
    noise_amplitude_final = params['noise_amplitude_final']
    noise_amplitude_decay = params['noise_amplitude_decay']
    """ Training loop  """
    max_step = 500
    max_score = -np.inf
    filename_format = "{:d}"
    scores_history = []  # list containing scores from each episode
    scores_window = deque(
        maxlen=params['scores_window_size'])  # last (window_size) scores
    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        state = env.reset()
        agent.reset()
        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not np.any(done):
            # Select an action with exploration noise
            action = agent.act(state, noise_amplitude=noise_amplitude)

            # Take action and get rewards and new state
            next_state, reward, done, _ = env.step(action)

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update the Critics and Actors of all the agents
            step += 1
            if (step % params['update_interval'] == 0
                    and len(memory) > params['replay_initial']):
                # Recall experiences (miniBatch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

            if max_step < step:
                break

        # Push to score list
        scores_window.append(score)
        scores_history.append(
            [score, np.mean(scores_window),
             np.std(scores_window)])

        # Print episode summary
        print(
            '\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'
            .format(i_episode, score, np.mean(scores_window), noise_amplitude),
            end="")
        if i_episode % 100 == 0:
            print(
                '\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'
                .format(i_episode, score, np.mean(scores_window),
                        noise_amplitude))
        if np.mean(scores_window) >= params['stop_scores']:
            max_score = np.mean(scores_window)
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - params['scores_window_size'],
                        np.mean(scores_window)))
            break
        elif max_score < np.mean(scores_window):
            max_score = np.mean(scores_window)

        # Update exploration
        noise_amplitude = max(noise_amplitude_final,
                              noise_amplitude * noise_amplitude_decay)
    """ End of the Training """
    print('\n')

    # Filename string
    filename = "{:05d}".format(trail_id)
    # Export trained agent's parameters
    # agent.export_network('./models/{:s}'.format(filename))
    # Export scores to csv file
    df = pandas.DataFrame(scores_history,
                          columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/optuna_logs/{:s}.csv'.format(filename),
              sep=',',
              index=False)
    #
    param_metas = list(params.keys())
    param_metas.extend(['scores', 'trained_episodes', 'filename'])
    param_values = list(params.values())
    param_values.extend([np.mean(scores_window), i_episode, filename])
    #
    optuna_log.append(param_values)
    optuna_df = pandas.DataFrame(optuna_log, columns=param_metas)
    optuna_df.to_csv('scores/{:s}.csv'.format(log_filename))
    #
    return (params['stop_scores'] - max_score)
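
# A minimal usage sketch, assuming an Optuna study drives this variant (the
# optuna_log list and the ./scores/optuna_logs/ path point that way). The
# objective below updates the global params dict from each trial before
# training; the parameter names and ranges are illustrative only.
import optuna

def objective(trial):
    params['actor_learning_rate'] = trial.suggest_float(
        'actor_learning_rate', 1e-5, 1e-3, log=True)
    params['critic_learning_rate'] = trial.suggest_float(
        'critic_learning_rate', 1e-5, 1e-3, log=True)
    params['batch_size'] = trial.suggest_int('batch_size', 32, 256)
    # train_agent() returns (stop_scores - max_score), so smaller is better.
    return train_agent(trial.number)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)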
Example #5
#env = Monitor(env, './video')
#state = env.reset()
#env.render()
#input('Press enter to start:')
""" Test loop  """
for i_episode in range(1, episodes + 1):
    # Reset the environment
    state = env.reset()

    # Reset score collector
    score = 0
    done = False
    # One episode loop
    while not done:
        # Action selection
        action = agent.act(state, noise_amplitude=0.0)
        env.render()

        # Take action and get rewards and new state
        next_state, reward, done, _ = env.step(2 * action)

        # State transition
        state = next_state

        # Update total score
        score += reward

    # Print episode summary
    print('\r#TEST Episode:{}, Score:{:.2f}'.format(i_episode, score))
""" End of the Test """
Example #6
scores_window = deque(
    maxlen=params['scores_window_size'])  # last (window_size) scores
for i_episode in range(1, params['train_episodes'] + 1):
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    agent.reset()
    # Capture the current state
    states = env_info.vector_observations
    dones = env_info.local_done
    # Reset score collector
    scores = np.zeros(
        number_of_agents)  # initialize the score (for each agent)
    # One episode loop
    step = 0
    while not np.any(dones):
        # Action selection
        actions = agent.act(states)

        # Take action and get rewards and new state
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done  # if next is terminal state

        # Store experience
        #memory.push(state, action, reward, next_state, done)

        # Update critic and actor policy
        #step += 1
        #if (step % params['update_interval']) == 0 and len(memory) > params['batch_size']:
        #    # Recall experiences (miniBatch)
        #    experiences = memory.recall()