Example #1
class MultiAgent():  # class header restored for readability; base class unknown, plain object assumed

    def __init__(self,
                 number_of_agents,
                 state_size,
                 action_size,
                 param,
                 seed=0):
        super(MultiAgent, self).__init__()

        # Parameter settings
        param['actor_state_size'] = state_size
        param['actor_action_size'] = action_size

        # Critic input = all_states + all_actions
        param['critic_state_size'] = state_size * number_of_agents
        param['critic_action_size'] = action_size * number_of_agents

        # Create Agent instance
        self.number_of_agents = number_of_agents
        # NOTE: agents are created explicitly for the two-agent case rather than
        # being derived from number_of_agents
        self.agents = [Agent(0, param, seed), Agent(1, param, seed)]
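
A minimal usage sketch for this constructor; the state and action sizes below are illustrative placeholders, and params is assumed to already hold the remaining hyperparameters the Agent constructor expects:

multi_agent = MultiAgent(number_of_agents=2,
                         state_size=24,     # illustrative value only
                         action_size=2,     # illustrative value only
                         param=params,
                         seed=0)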
Example #2

def train_agent(actor_learning_rate, critic_learning_rate, fc_units, thau,
                batch_size):
    # Set tunable parameters
    params['actor_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['critic_hidden_layers'] = [int(fc_units), int(fc_units / 2)]
    params['actor_learning_rate'] = actor_learning_rate
    params['critic_learning_rate'] = critic_learning_rate
    params['thau'] = thau
    params['batch_size'] = int(batch_size)

    # Create agent instance
    print("Created agent with following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=0)

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=0)
    update_interval = params['update_interval']
    replay_start = params['replay_initial']
    """ Training loop  """
    scores = []  # list containing scores from each episode
    scores_window = deque(
        maxlen=params['scores_window_size'])  # last (window_size) scores
    filemeta = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}_solved{:d}"
    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()

        # Capture the current state
        state = env_info.vector_observations[0]

        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not done:
            # Action selection
            action = agent.act(state)

            # Take action and get rewards and new state
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]  # if next is terminal state

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update Q-Learning
            step += 1
            if (step % update_interval) == 0 and len(memory) > replay_start:
                # Recall experiences (miniBatch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

        # Push to score list
        scores_window.append(score)
        scores.append([score, np.mean(scores_window), np.std(scores_window)])

        # Print episode summary
        print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.format(
            i_episode, score, np.mean(scores_window)),
              end="")
        if i_episode % 100 == 0:
            print('\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}'.
                  format(i_episode, score, np.mean(scores_window)))
        if np.mean(scores_window) >= params['stop_scores']:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - params['scores_window_size'],
                        np.mean(scores_window)))
            break
    """ End of the Training """
    print('\n')

    # Filename string
    filename = filemeta.format(params['env_name'], agent.name,
                               params['actor_learning_rate'],
                               params['critic_learning_rate'],
                               fc_units, params['thau'],
                               params['batch_size'], i_episode - 100)
    agent.export_network('./models/{:s}'.format(filename))
    # Export scores to csv file
    df = pandas.DataFrame(scores, columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/{:s}.csv'.format(filename), sep=',', index=False)

    hyperscores.append([
        params['actor_learning_rate'], params['critic_learning_rate'],
        fc_units, params['thau'], params['batch_size'],
        np.mean(scores_window), i_episode - params['scores_window_size']
    ])
    log_df = pandas.DataFrame(hyperscores,
                              columns=[
                                  'actor_learning_rate',
                                  'critic_learning_rate', 'fc_units', 'thau',
                                  'batch_size', 'scores', 'i_episode'
                              ])
    log_df.to_csv('scores/{:s}.csv'.format(log_filename))

    return (params['stop_scores'] - np.mean(scores_window))
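
Because train_agent returns params['stop_scores'] minus the final windowed average, it can serve directly as a minimization objective. The sketch below drives it with scikit-optimize's gp_minimize; the choice of skopt and the search ranges are assumptions for illustration, not taken from the original code.

from skopt import gp_minimize
from skopt.space import Real, Integer

# Illustrative search space (ranges are assumptions, not from the original repo)
search_space = [
    Real(1e-5, 1e-3, prior='log-uniform'),   # actor_learning_rate
    Real(1e-5, 1e-3, prior='log-uniform'),   # critic_learning_rate
    Integer(64, 512),                        # fc_units
    Real(1e-3, 1e-1, prior='log-uniform'),   # thau
    Integer(32, 256),                        # batch_size
]

# gp_minimize minimizes the returned value: (stop_scores - average score)
result = gp_minimize(lambda x: train_agent(*x), dimensions=search_space,
                     n_calls=20, random_state=0)
print('Best hyperparameters found:', result.x)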
Example #3
env = gym.make(env_name)
env.seed(params['random_seed'])

# Get environment parameter
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]
print('Number of actions : ', action_size)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)
print('Dimension of state space : ', state_size)
print('  - low:', env.observation_space.low)
print('  - high:', env.observation_space.high)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Filename string
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
filename = filename_format.format(  params['env_name'],agent.name,      \
                                    params['actor_learning_rate'],      \
                                    params['critic_learning_rate'],     \
                                    params['actor_hidden_layers'][0],   \
                                    params['thau'],params['batch_size'])

# Load the pre-trained network
agent.import_network('./models/{:s}'.format(filename))

# Define parameters for test
episodes = 10  # maximum number of test episodes
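
The snippet ends after setting episodes; a minimal sketch of the test loop that could follow, assuming agent.act returns a deterministic action when called without exploration noise and that the environment uses the classic gym step API:

for i_episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.act(state)                 # assumed noise-free at test time
        state, reward, done, _ = env.step(action)
        score += reward
    print('#TEST Episode:{}, Score:{:.2f}'.format(i_episode, score))
env.close()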
Example #4
env = gym.make(env_name)
env.seed(params['random_seed'])

# Get environment parameter
action_size = env.action_space.shape[0]
state_size = env.observation_space.shape[0]
print('Number of actions : ', action_size)
print('  - low:', env.action_space.low)
print('  - high:', env.action_space.high)
print('Dimension of state space : ', state_size)
print('  - low:', env.observation_space.low)
print('  - high:', env.observation_space.high)

# Initialize agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              param=params,
              seed=params['random_seed'])

# Initialize replay buffer
memory = ReplayBuffer(action_size,
                      params['replay_size'],
                      params['batch_size'],
                      seed=params['random_seed'])

print('Hyperparameter values:')
pprint.pprint(params)
""" Training loop  """
filename_format = "{:s}_{:s}_{:.1E}_{:.1E}_{:d}_{:.1E}_{:d}"
scores = []  # list containing scores from each episode
scores_window = deque(
    maxlen=params['scores_window_size'])  # last (window_size) scores
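
filename_format is defined here but the snippet cuts off before it is used; mirroring the other examples, the filename would presumably be built as below (argument order assumed to match the format string):

filename = filename_format.format(params['env_name'], agent.name,
                                  params['actor_learning_rate'],
                                  params['critic_learning_rate'],
                                  params['actor_hidden_layers'][0],
                                  params['thau'], params['batch_size'])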
Example #5
def train_agent(trail_id):
    # Create agent instance
    print("Created agent with following hyperparameter values:")
    pprint.pprint(params)

    # Initialize agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  param=params,
                  seed=params['random_seed'])

    # Initialize replay buffer
    memory = ReplayBuffer(action_size,
                          params['replay_size'],
                          params['batch_size'],
                          seed=params['random_seed'])

    # Define parameters for exploration
    noise_amplitude = params['noise_amplitude_start']
    noise_amplitude_final = params['noise_amplitude_final']
    noise_amplitude_decay = params['noise_amplitude_decay']
    """ Training loop  """
    max_step = 500
    max_score = -np.Inf
    filename_format = "{:d}"
    scores_history = []  # list containing scores from each episode
    scores_window = deque(
        maxlen=params['scores_window_size'])  # last (window_size) scores
    for i_episode in range(1, params['train_episodes'] + 1):
        # Reset the environment
        state = env.reset()
        agent.reset()
        # Reset score collector
        score = 0
        # One episode loop
        step = 0
        done = False
        while not np.any(done):
            # Get action from the agent
            action = agent.act(state, noise_amplitude=noise_amplitude)

            # Take action and get rewards and new state
            next_state, reward, done, _ = env.step(action)

            # Store experience
            memory.push(state, action, reward, next_state, done)

            # Update the Critics and Actors of all the agents
            step += 1
            if (step % params['update_interval']
                ) == 0 and len(memory) > params['replay_initial']:
                # Recall experiences (miniBatch)
                experiences = memory.recall()
                # Train agent
                agent.learn(experiences)

            # State transition
            state = next_state

            # Update total score
            score += reward

            if max_step < step:
                break

        # Push to score list
        scores_window.append(score)
        scores_history.append(
            [score, np.mean(scores_window),
             np.std(scores_window)])

        # Print episode summary
        print(
            '\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'
            .format(i_episode, score, np.mean(scores_window), noise_amplitude),
            end="")
        if i_episode % 100 == 0:
            print(
                '\r#TRAIN Episode:{}, Score:{:.2f}, Average Score:{:.2f}, Exploration:{:1.4f}'
                .format(i_episode, score, np.mean(scores_window),
                        noise_amplitude))
        if np.mean(scores_window) >= params['stop_scores']:
            max_score = np.mean(scores_window)
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode - 100, np.mean(scores_window)))
            break
        elif max_score < np.mean(scores_window):
            max_score = np.mean(scores_window)

        # Update exploration
        noise_amplitude = max(noise_amplitude_final,
                              noise_amplitude * noise_amplitude_decay)
    """ End of the Training """
    print('\n')

    # Filename string
    filename = "{:05d}".format(trail_id)
    # Export trained agent's parameters
    #agents.export_network('./models/{:s}'.format(filename))
    # Export scores to csv file
    df = pandas.DataFrame(scores_history,
                          columns=['scores', 'average_scores', 'std'])
    df.to_csv('./scores/optuna_logs/{:s}.csv'.format(filename),
              sep=',',
              index=False)
    #
    param_metas = [key for key in params.keys()]
    param_metas.extend(['scores', 'trained_episodes', 'filename'])
    param_values = [value for value in params.values()]
    param_values.extend([np.mean(scores_window), i_episode, filename])
    #
    optuna_log.append(param_values)
    optuna_df = pandas.DataFrame(optuna_log, columns=param_metas)
    optuna_df.to_csv('scores/{:s}.csv'.format(log_filename))
    #
    return (params['stop_scores'] - max_score)
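
The optuna_log list, the scores/optuna_logs output path, and the (stop_scores - max_score) return value all suggest this train_agent is meant to be driven by Optuna. A minimal sketch of that wiring; the hyperparameter names and ranges pushed into params are illustrative assumptions:

import optuna

def objective(trial):
    # Ranges and parameter names are illustrative assumptions
    params['actor_learning_rate'] = trial.suggest_float('actor_learning_rate',
                                                        1e-5, 1e-3, log=True)
    params['critic_learning_rate'] = trial.suggest_float('critic_learning_rate',
                                                         1e-5, 1e-3, log=True)
    params['batch_size'] = trial.suggest_int('batch_size', 32, 256)
    # Lower is better: train_agent returns (stop_scores - best windowed average)
    return train_agent(trial.number)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Best trial parameters:', study.best_trial.params)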