Example #1
def train_policy(optimizer: Optimizer, training_info: TrainingInfo,
                 run_params: RunParams):
    """ Trains the policy using the policy gradient method, given the discounted rewards of the latest episode
    Entropy is also taken into account. Each new episode diminishes its importance by run_params.entropy_decay,
    such that the agent will explore at the beginning and tend to explore less and less over time. The agent is
    trained once on all the transitions of the episode (instead of training many times over mini-batches).
    """
    training_info.compute_discounted_rewards()

    # Compute the loss of the policy at each time step
    policy_losses = []
    for log_prob, discounted_reward, entropy in zip(
            training_info.log_probs, training_info.discounted_rewards,
            training_info.entropies):
        entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay**training_info.episode_number
        policy_losses.append(-(log_prob + entropy_coeff * entropy) *
                             discounted_reward)

    # Optimize the policy
    optimizer.zero_grad()
    total_policy_loss = torch.cat(policy_losses).sum()
    total_policy_loss.backward()
    optimizer.step()

    # Reset the state of the episode
    training_info.reset()
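The helper compute_discounted_rewards is not shown on this page. Below is a minimal sketch of what it presumably does, assuming TrainingInfo keeps the episode's raw rewards in self.rewards and its discount factor in self.GAMMA (the exact field names are an assumption):

# Hypothetical sketch of TrainingInfo.compute_discounted_rewards() (field names are assumptions)
def compute_discounted_rewards(self):
    running_return = 0.0
    self.discounted_rewards = []
    # Walk the episode backwards and accumulate the reward-to-go for each step
    for reward in reversed(self.rewards):
        running_return = reward + self.GAMMA * running_return
        self.discounted_rewards.insert(0, running_return)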
Example #2
def train_policy_on_episode(optimizer: Optimizer, training_info: TrainingInfo,
                            episode_number: int):
    """ Trains both the actor and the critic using all transitions of the latest episode. The actor's loss is the MSE
     between V(state) and reward + gamma * V(next state), where V indicates the actor's value function.
     The actor / policy is trained by maximizing the log probability * td-error, and an entropy term is
     added to encourage exploration. The entropy is decayed at new each episode by the run_params.entropy_decay
     coefficient.
    """
    training_info.compute_discounted_rewards()

    # Compute the loss of the policy and the critic at each time step
    policy_losses = []  # Policy errors
    value_losses = []  # Critic errors
    for log_prob, discounted_reward, state_value, entropy in zip(
            training_info.log_probs, training_info.discounted_rewards,
            training_info.state_values, training_info.entropies):
        advantage = discounted_reward - state_value.item()
        policy_losses.append(-(log_prob + 0.99**episode_number * entropy) *
                             advantage)
        value_losses.append(
            F.smooth_l1_loss(state_value.squeeze(0),
                             torch.tensor([discounted_reward])))

    # Optimize the policy
    optimizer.zero_grad()
    total_policy_loss = torch.stack(policy_losses).sum() + torch.stack(
        value_losses).sum()
    total_policy_loss.backward()
    optimizer.step()

    # Reset the state of the episode
    training_info.reset()
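For context, here is a hedged sketch of how this function might be driven from an outer episode loop; select_action_discrete and get_state_value are assumed to also append each step's log probability, entropy and state value to training_info (as in the other examples on this page), but that behaviour is an assumption:

# Hypothetical driver loop for train_policy_on_episode (helper behaviour is assumed)
import itertools

for episode_number in itertools.count(1):
    state = env.reset()
    for t in range(env.spec.max_episode_steps):
        action = select_action_discrete(state, policy, training_info)  # records log_prob and entropy
        state_value = get_state_value(state, critic)                   # records V(state)
        new_state, reward, done, _ = env.step(action)
        training_info.record_step(state, action, reward, state_value)
        state = new_state
        if done:
            break
    # One combined actor + critic update on the whole episode; also resets training_info
    train_policy_on_episode(optimizer, training_info, episode_number)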
Example #3
def train_policy_batches(policy: SimplePolicyContinuous, optimizer: Optimizer,
                         training_info: TrainingInfo, run_params: RunParams):
    """ Trains the policy using the policy gradient method, given the discounted rewards of the latest episode
    Entropy is also taken into account. Each new episode diminishes its importance by run_params.entropy_decay,
    such that the agent will explore at the beginning and tend to explore less and less over time. The agent is
    trained many times over mini-batches of transitions of the episode (instead of being trained once on all
    transitions)"""
    training_info.compute_discounted_rewards()

    for (states, actions, discounted_rewards) in training_info.get_batches(
            run_params.batch_size):
        train_batch(policy, states, actions, discounted_rewards, optimizer,
                    training_info.episode_number, run_params)

    training_info.reset()
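train_batch is not shown on this page. Below is a minimal sketch of what a mini-batch update could look like under the same policy-gradient-with-entropy loss, assuming the policy exposes a get_distribution helper returning a torch.distributions object for a batch of states (that helper name and signature are assumptions):

# Hypothetical sketch of train_batch (the policy's get_distribution helper is an assumption)
def train_batch(policy, states, actions, discounted_rewards, optimizer,
                episode_number, run_params):
    distribution = policy.get_distribution(states)
    log_probs = distribution.log_prob(actions)
    entropies = distribution.entropy()
    # Same decayed entropy bonus as in the non-batched version above
    entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay ** episode_number
    loss = (-(log_probs + entropy_coeff * entropies) * discounted_rewards).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()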
Example #4
def sac_entropy_adjustment_train(env: gym.Env, run_params: RunParams,
                                 sac_params: SacEntropyAdjustmentParams):
    """ Trains the soft actor critic (SAC) on the given environment. Training is done at the end of each episode.
    Only continuous actions spaces are supported. Several features can be optionally enabled:
    1) Scaling / normalizing the states / observations
    2) Logging training statistics on Tensorboard
    3) Render the environment periodically (pick render_frequency in the RunParams)
    4) Testing the agent's performance periodically
    5) Saving the policy and value estimators to disk periodically
    6) During the first X steps, do random actions (see RunParams.num_random_action_steps)
    """
    assert run_params.continuous_actions, "SAC implementation only implemented for continuous action spaces"

    print(
        f"The goal is a running reward of at least {env.spec.reward_threshold}."
    )

    # Optimization for speed: don't compute gradients for the target networks, since we never backpropagate through them
    for network in [
            sac_params.policy_target, sac_params.value_estimator1_target,
            sac_params.value_estimator2_target
    ]:
        for parameter in network.parameters():
            parameter.requires_grad = False

    # Setup tensorboard
    writer = run_params.get_tensorboard_writer(
        env) if run_params.use_tensorboard else None

    # Setup scaler, training info and replay buffer
    scaler = setup_observation_scaler(
        env) if run_params.should_scale_states else None
    training_info = TrainingInfo(GAMMA=run_params.gamma)
    replay_buffer = ReplayBuffer(sac_params.replay_buffer_size)

    training_step_number, step_number, test_episode_num = 0, 0, 0
    max_episode_steps = env.spec.max_episode_steps

    for episode_number in range(run_params.maximum_episodes):
        state = env.reset()
        episode_length = 0

        # Update policy bounds
        # sac_params.policy.action_high = sac_params.policy_target.action_high = torch.tensor(env.action_space.high)
        # sac_params.policy.action_low = sac_params.policy_target.action_low = torch.tensor(env.action_space.low)

        # Do a whole episode
        for t in range(max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            # Pick an action, execute and observe the results
            # Note: in the first num_random_action_steps steps, we randomly pick actions from
            # the action space (uniformly) to have better exploration.
            if step_number >= sac_params.num_random_action_steps:
                action, log_prob = select_action_sac(state,
                                                     sac_params,
                                                     compute_log_prob=True)
            else:
                action = env.action_space.sample()
                log_prob = -1

            # To be sure that actions are in the action space (see watershed.py)
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)

            # For debugging, log the Q-values
            if run_params.use_tensorboard:
                if random.random() < 0.02:  # Don't log too often to avoid slowing things down
                    s, a = torch.tensor(state).float(), torch.tensor(
                        action).float()
                    value1 = sac_params.value_estimator1.forward(s, a)
                    value2 = sac_params.value_estimator2.forward(s, a)
                    value1_target = sac_params.value_estimator1_target.forward(
                        s, a)
                    value2_target = sac_params.value_estimator2_target.forward(
                        s, a)

                    for action_index in range(a.shape[0]):
                        writer.add_scalar(f"Action/{action_index}",
                                          a[action_index], step_number)
                    writer.add_scalar("Q-values/Normal Network 1", value1,
                                      step_number)
                    writer.add_scalar("Q-values/Normal Network 2", value2,
                                      step_number)
                    writer.add_scalar("Q-values/Target Network 1",
                                      value1_target, step_number)
                    writer.add_scalar("Q-values/Target Network 2",
                                      value2_target, step_number)
                    writer.add_scalar("Action/Log prob action", log_prob,
                                      step_number)

            new_state, reward, done, _ = env.step(action)

            # Render the environment if wanted
            if run_params.should_render(episode_number):
                env.render()

            # Record the step's reward (the running reward itself is updated at the end of the episode)
            training_info.record_step(state, action, reward)

            # Add the transition to the replay buffer
            new_state_scaled = scale_state(
                scaler,
                new_state) if run_params.should_scale_states else new_state
            replay_buffer.store(state, action, reward, new_state_scaled, done
                                and t < max_episode_steps - 1)

            state = new_state
            if done:
                break

            step_number += 1
            episode_length += 1

        # Training at the end of the episode approach taken from
        # https://github.com/createamind/DRL/blob/master/spinup/algos/sac1/sac1_BipedalWalker-v2_200ep.py
        for update_step in range(int(episode_length * 1.5)):
            batch_transitions = replay_buffer.sample_batch(
                sac_params.batch_size)
            update_models(batch_transitions, sac_params, run_params, writer,
                          training_step_number)
            training_step_number += 1

        if episode_number % sac_params.test_frequency == 0:
            test_agent_performance(env, sac_params, run_params, writer,
                                   test_episode_num, scaler)
            test_episode_num += 1

        if run_params.should_save_model(episode_number):
            save_model_sac(env, sac_params, scaler)

        training_info.update_running_reward(rate=0.01)

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t + 1,
                       training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t + 1,
                           training_info, writer)

        # Check if we have solved the environment reliably
        if run_params.stop_at_threshold and env.spec.reward_threshold is not None and training_info.running_reward > env.spec.reward_threshold:
            print(
                f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                f"{env.spec.reward_threshold}. The last episode ran for {t} steps."
            )
            save_model_sac(env, sac_params, scaler)
            break

        training_info.reset()

    close_tensorboard(run_params, writer)
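The gradient steps themselves happen inside update_models, which is not shown on this page. Given the SacEntropyAdjustmentParams name, the snippet below is a minimal sketch of the usual automatic entropy-temperature update only, assuming sac_params carries log_alpha, alpha_optimizer and target_entropy fields and that the policy can return log probabilities for a batch of states (all of these names and signatures are assumptions):

# Hypothetical sketch of the entropy-temperature (alpha) adjustment step inside update_models.
# log_alpha, alpha_optimizer and target_entropy are assumed fields of sac_params.
import torch

def update_entropy_coefficient(states, sac_params):
    with torch.no_grad():
        _, log_probs = sac_params.policy(states, compute_log_prob=True)  # assumed policy interface
    # Push alpha so that the policy's entropy tracks the chosen target entropy
    alpha_loss = -(sac_params.log_alpha * (log_probs + sac_params.target_entropy)).mean()
    sac_params.alpha_optimizer.zero_grad()
    alpha_loss.backward()
    sac_params.alpha_optimizer.step()
    return sac_params.log_alpha.exp().item()  # the temperature used to weight the entropy term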
Example #5
def actor_critic_train_per_step(
        policy: SimplePolicyContinuous,
        critic: SimpleCritic,
        env: gym.Env,
        optimizer: Optimizer,
        run_params: RunParams,
        lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None):
    """ Trains the actor critic on the given environment. Training is done at the end of each step, instead of
    at the end of each episode. This means the agent trains less frequently than doing it at each step.
    Both discrete and continuous actions spaces are supported. Several features can be optionally enabled:
    1) Scaling / normalizing the states / observations
    2) Logging training statistics on Tensorboard
    3) Render the environment periodically (pick render_frequency in the RunParams)
    4) Using a learning rate scheduler
    """
    training_info = TrainingInfo(GAMMA=run_params.gamma)
    print(
        f"The goal is a running reward of at least {env.spec.reward_threshold}."
    )

    if run_params.should_scale_states:
        scaler = setup_observation_scaler(env)

    writer = run_params.get_tensorboard_writer(
        env) if run_params.use_tensorboard else None

    for episode_number in itertools.count(
            1):  # itertools.count() is basically range(+infinity)
        state = env.reset()

        # Do a whole episode (up to env.spec.max_episode_steps steps, to avoid infinite episodes)
        for t in range(env.spec.max_episode_steps):
            scaled_state = scale_state(
                scaler, state) if run_params.should_scale_states else state
            if run_params.continuous_actions:
                action = select_action_continuous(scaled_state, policy,
                                                  training_info, env)
            else:
                action = select_action_discrete(scaled_state, policy,
                                                training_info)

            state_value = get_state_value(scaled_state, critic)

            new_state, reward, done, _ = env.step(action)
            if run_params.should_render(episode_number):
                env.render()

            scaled_new_state = scale_state(
                scaler,
                new_state) if run_params.should_scale_states else new_state
            training_info.record_step(scaled_state, action, reward,
                                      state_value)
            train_policy_on_step(critic, optimizer, reward, scaled_state,
                                 scaled_new_state, training_info.GAMMA,
                                 training_info.log_probs[-1],
                                 training_info.entropies[-1], episode_number,
                                 run_params)

            state = new_state
            if done:
                break

        training_info.update_running_reward()

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t,
                       training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t,
                           training_info, writer)

        # Check if we have solved the environment reliably
        if env.spec.reward_threshold is not None and training_info.running_reward > env.spec.reward_threshold:
            print(
                f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                f"{env.spec.reward_threshold}. The last episode ran for {t} steps."
            )
            break

        training_info.reset()

        if lr_scheduler:
            lr_scheduler.step(episode_number)

    close_tensorboard(run_params, writer)
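train_policy_on_step is not shown on this page. Below is a minimal sketch of a one-step TD actor-critic update consistent with the call above (the repository's actual implementation may differ; get_state_value is assumed to return a value estimate that keeps gradients):

# Hypothetical sketch of train_policy_on_step (details are assumptions)
import torch
import torch.nn.functional as F

def train_policy_on_step(critic, optimizer, reward, state, new_state, gamma,
                         log_prob, entropy, episode_number, run_params):
    value = get_state_value(state, critic)               # V(s), keeps gradients for the critic loss
    with torch.no_grad():
        next_value = get_state_value(new_state, critic)  # V(s'), treated as a constant target
    td_target = reward + gamma * next_value
    td_error = td_target - value
    # Decayed entropy bonus, as in the other policy-gradient examples on this page
    entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay ** episode_number
    actor_loss = -(log_prob + entropy_coeff * entropy) * td_error.detach()
    critic_loss = F.smooth_l1_loss(value, td_target)
    optimizer.zero_grad()
    (actor_loss + critic_loss).backward()
    optimizer.step()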
Example #6
def ddpg_train(env: gym.Env, run_params: RunParams, ddpg_params: DDPGParams):
    """
    :param env: the OpenAI gym environment
    :param run_params: the general training parameters shared by all training algorithm
    :param ddpg_params: the DDPG-specific information (networks, optimizers, parameters)
    """
    assert run_params.continuous_actions, "DDPG implementation only implemented for continuous action spaces"

    print(f"The goal is a running reward of at least {env.spec.reward_threshold}.")

    # Optimization for speed: don't compute gradients for the target networks, since we never backpropagate through them
    for network in [ddpg_params.policy_target, ddpg_params.value_estimator_target]:
        for parameter in network.parameters():
            parameter.requires_grad = False

    # Setup tensorboard
    writer = run_params.get_tensorboard_writer(env) if run_params.use_tensorboard else None

    # Setup scaler, training info and replay buffer
    scaler = setup_observation_scaler(env) if run_params.should_scale_states else None
    training_info = TrainingInfo(GAMMA=run_params.gamma)
    replay_buffer = ReplayBuffer(ddpg_params.replay_buffer_size)

    step_number, test_episode_num = 0, 0
    max_episode_steps = env.spec.max_episode_steps
    value_time_step = 0

    for episode_number in range(run_params.maximum_episodes):
        state = env.reset()

        # Do a whole episode
        for t in range(max_episode_steps):
            if run_params.should_scale_states:
                state = scale_state(scaler, state)

            # Pick an action, execute and observe the results
            # Note: in the first num_random_action_steps steps, we randomly pick actions from
            # the action space (uniformly) to have better exploration.
            if step_number >= ddpg_params.num_random_action_steps:
                action = select_action_ddpg(state, ddpg_params, env, ddpg_params.noise_coeff * 0.995 ** episode_number)
            else:
                action = env.action_space.sample()

            # For debugging, log the Q-values
            if run_params.use_tensorboard:
                s, a = torch.tensor(state).float(), torch.tensor(action).float()
                value = ddpg_params.value_estimator.forward(s, a)
                value_target = ddpg_params.value_estimator_target.forward(s, a)

                for action_index in range(a.shape[0]):
                    writer.add_scalar(f"Action/{action_index}", a[action_index], value_time_step)
                writer.add_scalar("Q-values/Normal Network", value, value_time_step)
                writer.add_scalar("Q-values/Target Network", value_target, value_time_step)

                value_time_step += 1

            new_state, reward, done, _ = env.step(action)

            # Render the environment if wanted
            if run_params.should_render(episode_number):
                env.render()

            # Record the step's reward (the running reward itself is updated at the end of the episode)
            training_info.record_step(state, action, reward)

            # Add the transition to the replay buffer
            new_state_scaled = scale_state(scaler, new_state) if run_params.should_scale_states else new_state
            replay_buffer.store(state, action, reward, new_state_scaled, done and t < max_episode_steps - 1)

            state = new_state
            if done:
                break

            if step_number >= ddpg_params.update_start and step_number % ddpg_params.update_frequency == 0:
                for update_step in range(ddpg_params.update_frequency):
                    batch_transitions = replay_buffer.sample_batch(ddpg_params.batch_size)
                    update_models(batch_transitions, ddpg_params, run_params, writer, step_number)

            step_number += 1

        if episode_number % ddpg_params.test_frequency == 0:
            test_agent_performance(env, ddpg_params, run_params, writer, test_episode_num, scaler)
            test_episode_num += 1

        if run_params.should_save_model(episode_number):
            save_model_ddpg(ddpg_params, env, scaler)

        training_info.update_running_reward()

        # Add some logging
        log_on_console(env, episode_number, reward, run_params, t, training_info)
        log_on_tensorboard(env, episode_number, reward, run_params, t, training_info, writer)

        # Check if we have solved the environment reliably
        if run_params.stop_at_threshold and env.spec.reward_threshold is not None and training_info.running_reward > env.spec.reward_threshold:
            print(f"Solved! The running reward is {training_info.running_reward:.2f}, which is above the threshold of "
                  f"{env.spec.reward_threshold}. The last episode ran for {t} steps.")
            break

        training_info.reset()
        ddpg_params.noise_source.reset()

    close_tensorboard(run_params, writer)
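update_models is not shown on this page either. Below is a minimal sketch of a standard DDPG update consistent with the fields used above; the batch layout, the per-network optimizers (policy_optimizer, value_optimizer) and the polyak coefficient are assumptions:

# Hypothetical sketch of update_models for DDPG (field names are assumptions)
import torch
import torch.nn.functional as F

def update_models(batch, ddpg_params, run_params, writer, step_number):
    states, actions, rewards, next_states, dones = batch
    # Critic: regress Q(s, a) towards the Bellman target built with the frozen target networks
    with torch.no_grad():
        next_actions = ddpg_params.policy_target(next_states)
        target_q = rewards + run_params.gamma * (1 - dones) * \
            ddpg_params.value_estimator_target(next_states, next_actions)
    critic_loss = F.mse_loss(ddpg_params.value_estimator(states, actions), target_q)
    ddpg_params.value_optimizer.zero_grad()
    critic_loss.backward()
    ddpg_params.value_optimizer.step()

    # Actor: maximize Q(s, policy(s)) (a full implementation typically freezes the critic's parameters here)
    actor_loss = -ddpg_params.value_estimator(states, ddpg_params.policy(states)).mean()
    ddpg_params.policy_optimizer.zero_grad()
    actor_loss.backward()
    ddpg_params.policy_optimizer.step()

    # Polyak averaging keeps the target networks slowly trailing the trained networks
    with torch.no_grad():
        for net, target in [(ddpg_params.policy, ddpg_params.policy_target),
                            (ddpg_params.value_estimator, ddpg_params.value_estimator_target)]:
            for p, p_target in zip(net.parameters(), target.parameters()):
                p_target.mul_(ddpg_params.polyak).add_((1 - ddpg_params.polyak) * p)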