Example #1
def fill_buffer():
    agent = RandomAgent(m)
    # Fill the buffer with at least L experiences collected by a random agent
    while len(buffer) < L:
        done = False
        state = env.reset()
        while not done:
            action = agent.forward(state)
            next_state, reward, done, _ = env.step(action)
            buffer.append((state, action, reward, next_state, done))
            state = next_state
    print('Buffer filled!')
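Example #1 (and the examples below) rely on a RandomAgent class and an ExperienceReplayBuffer that are defined elsewhere in the project and not shown here. The classes below are only a minimal sketch of the interface the snippets call (RandomAgent(m).forward(state); buffer.append(...), len(buffer), buffer.sample_batch(n=...)), not the original implementations.

import numpy as np
from collections import deque


class RandomAgent:
    ''' Sketch of an agent that ignores the state and samples each action
        component uniformly in [-1, 1] (the LunarLanderContinuous-v2 bounds) '''
    def __init__(self, n_actions):
        self.n_actions = n_actions

    def forward(self, state):
        # The state is ignored on purpose: every action is purely random
        return np.random.uniform(-1.0, 1.0, size=self.n_actions)


class ExperienceReplayBuffer:
    ''' Sketch of a FIFO replay buffer exposing the interface used in the
        examples: append, len() and sample_batch(n) '''
    def __init__(self, maximum_length):
        self.buffer = deque(maxlen=maximum_length)

    def append(self, experience):
        self.buffer.append(experience)

    def __len__(self):
        return len(self.buffer)

    def sample_batch(self, n):
        # Sample n experiences without replacement and return them as
        # separate arrays: states, actions, rewards, next_states, dones
        indices = np.random.choice(len(self.buffer), size=n, replace=False)
        batch = [self.buffer[i] for i in indices]
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones))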
Example #2
def run_sim(agent=None):
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Parameters
    N_episodes = 50  # Number of episodes
    n_ep_running_average = 50  # Running average of 50 episodes
    n_actions = len(env.action_space.high)  # Action dimension
    dim_state = len(env.observation_space.high)  # State dimensionality

    # We will use this list to compute the average episodic reward
    episode_reward_list = []  # this list contains the total reward per episode

    if agent is None:
        actor = RandomAgent(n_actions)
    else:
        actor = agent
    for i in range(N_episodes):
        # Reset environment data and initialize variables
        done = False
        state = env.reset()
        total_episode_reward = 0.
        while not done:
            # The random agent acts on the raw state; a trained
            # PyTorch actor expects a batched float tensor
            if agent is None:
                action = actor.forward(state)
            else:
                action = actor.forward(
                    torch.tensor(
                        [state],
                        dtype=torch.float32))[0].cpu().detach().numpy()
            next_state, reward, done, _ = env.step(action)

            # Update episode reward
            total_episode_reward += reward

            # Update state for next iteration
            state = next_state

        # Append episode reward
        episode_reward_list.append(total_episode_reward)

        # Close environment
        env.close()
    return episode_reward_list
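A hypothetical way to use run_sim, assuming the actor network was saved with torch.save(model, path) as in the training example further down (so torch.load returns the full module):

import numpy as np
import torch

# Evaluate the random baseline
random_rewards = run_sim()

# Evaluate a trained actor (file name taken from the training example below)
actor = torch.load('neural-network-2-actor.pth')
trained_rewards = run_sim(agent=actor)

print('Avg. reward (random): {:.1f}'.format(np.mean(random_rewards)))
print('Avg. reward (trained): {:.1f}'.format(np.mean(trained_rewards)))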
Example #3
# Import and initialize the Lunar Lander environment
env = gym.make('LunarLanderContinuous-v2')
env.reset()

# Parameters
N_episodes = 100  # Number of episodes to run for training
discount_factor = 0.95  # Value of gamma
n_ep_running_average = 50  # Running average of 50 episodes
m = len(env.action_space.high)  # dimensionality of the action

# Reward
episode_reward_list = []  # Used to save episode rewards
episode_number_of_steps = []

# Agent initialization
agent = RandomAgent(m)

# Training process
EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

for i in EPISODES:
    # Reset environment data
    done = False
    state = env.reset()
    total_episode_reward = 0.
    t = 0
    while not done:
        # Take a random action
        action = agent.forward(state)

        # Get next state and reward. The done variable
        # will be True if you reached the goal position,
        # False otherwise
        next_state, reward, done, _ = env.step(action)

        # Update episode reward
        total_episode_reward += reward

        # Update state for next iteration
        state = next_state
        t += 1

    # Append episode reward and total number of steps
    episode_reward_list.append(total_episode_reward)
    episode_number_of_steps.append(t)

    # Close environment
    env.close()
Example #4
def main():
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using", dev)

    # Import and initialize the Lunar Lander environment
    env = gym.make('LunarLanderContinuous-v2')
    env.reset()

    # Parameters
    N_episodes = 300  # Number of episodes to run for training
    discount_factor = 0.99  # Value of gamma
    n_ep_running_average = 50  # Running average of 50 episodes
    dim_state = len(env.observation_space.high)  # State dimensionality
    m = len(env.action_space.high)  # dimensionality of the action
    lr_actor = 5e-5  # Actor network learning rate
    lr_critic = 5e-4  # Critic network learning rate
    d = 2  # Policy update frequency
    tau = 1e-3  # Soft update constant tau
    mu = 0.15  # Exploration noise parameter mu
    sigma = 0.2  # Exploration noise parameter sigma

    # Reward
    episode_reward_list = []  # Used to save episode rewards
    episode_number_of_steps = []

    # Actor-critic agent initialization
    agent = AgentQ(m, dim_state, lr_actor, lr_critic, N_episodes,
                   discount_factor, mu, sigma, tau, dev)

    # Initialize Buffer
    buffer = ExperienceReplayBuffer(maximum_length=L)  # L: buffer capacity, defined elsewhere
    random_agent = RandomAgent(m)
    state = env.reset()
    for _ in tqdm(range(L)):
        # Take a random action to pre-fill the buffer
        action = random_agent.forward(state)
        next_state, reward, done, _ = env.step(action)
        experience = (state, action, reward, next_state, done)  # Create the experience
        buffer.append(experience)  # Append the experience to the buffer
        state = next_state
        if done:
            state = env.reset()

    # Training process
    EPISODES = trange(N_episodes, desc='Episode: ', leave=True)

    for i in EPISODES:
        # Reset environment data and initialize variables
        done = False
        state = env.reset()
        total_episode_reward = 0.
        t = 0
        agent.n = np.zeros(agent.m)  # Reset noise in each episode

        while not done:
            # Select an action using the actor network
            action = agent.forward(state, None, grad=False)

            # Get next state and reward. The done variable
            # will be True if you reached the goal position,
            # False otherwise
            next_state, reward, done, _ = env.step(action)
            experience = (state, action, reward, next_state, done)  # Create the experience
            buffer.append(experience)  # Append the experience to the buffer

            if len(buffer) >= N:
                # Sample N elements from the buffer
                states, actions, rewards, next_states, dones = buffer.sample_batch(
                    n=N)
                actions = torch.tensor(actions,
                                       dtype=torch.float32,
                                       device=dev)
                mask = torch.tensor(np.multiply(dones, 1),
                                    device=dev).reshape(-1, 1)
                Q_prime = agent.forward_target(next_states)
                rewards_tensor = torch.tensor(rewards,
                                              device=dev).reshape(-1, 1)
                targets = (rewards_tensor +
                           (1 - mask) * discount_factor * Q_prime).type(
                               torch.float32)
                values = agent.forward(states, actions, grad=True)
                agent.backward(values, targets)

                if t % d == 0:
                    agent.policy_backward(states)

            # Update episode reward
            total_episode_reward += reward
            # Update state for next iteration
            state = next_state
            t += 1
            agent.noise()  # Update noise

        # Append episode reward and total number of steps
        episode_reward_list.append(total_episode_reward)
        episode_number_of_steps.append(t)

        # Close environment
        env.close()

        # Updates the tqdm update bar with fresh information
        # (episode number, total reward of the last episode, total number of Steps
        # of the last episode, average reward, average number of steps)
        EPISODES.set_description(
            "Episode {} - Reward/Steps: {:.1f}/{} - Avg. Reward/Steps: {:.1f}/{}"
            .format(
                i, total_episode_reward, t,
                running_average(episode_reward_list, n_ep_running_average)[-1],
                running_average(episode_number_of_steps,
                                n_ep_running_average)[-1]))

    # Save the actor and critic networks
    torch.save(agent.actor_network, 'neural-network-2-actor.pth')
    torch.save(agent.critic_network, 'neural-network-2-critic.pth')

    # Plot Rewards and steps
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
    ax[0].plot([i for i in range(1, N_episodes + 1)],
               episode_reward_list,
               label='Episode reward')
    ax[0].plot([i for i in range(1, N_episodes + 1)],
               running_average(episode_reward_list, n_ep_running_average),
               label='Avg. episode reward')
    ax[0].set_xlabel('Episodes')
    ax[0].set_ylabel('Total reward')
    ax[0].set_title('Total Reward vs Episodes')
    ax[0].legend()
    ax[0].grid(alpha=0.3)

    ax[1].plot([i for i in range(1, N_episodes + 1)],
               episode_number_of_steps,
               label='Steps per episode')
    ax[1].plot([i for i in range(1, N_episodes + 1)],
               running_average(episode_number_of_steps, n_ep_running_average),
               label='Avg. number of steps per episode')
    ax[1].set_xlabel('Episodes')
    ax[1].set_ylabel('Total number of steps')
    ax[1].set_title('Total number of steps vs Episodes')
    ax[1].legend()
    ax[1].grid(alpha=0.3)
    plt.savefig('Result_problem2.png')
    plt.show()
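The training loop and the plots above also call a running_average helper that is not defined in these snippets. A minimal sketch consistent with how it is used (a list of values, a window length, and [-1] indexing on the result) could be:

import numpy as np

def running_average(x, N):
    ''' Running average of the last N elements of a vector x (sketch) '''
    if len(x) >= N:
        y = np.copy(x)
        y[N - 1:] = np.convolve(x, np.ones(N) / N, mode='valid')
    else:
        y = np.zeros_like(x)
    return y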