Example #1
import torch
from torch.optim import Optimizer

# TrainingInfo and RunParams are helper classes defined elsewhere in the project.


def train_policy(optimizer: Optimizer, training_info: TrainingInfo,
                 run_params: RunParams):
    """ Trains the policy using the policy gradient method, given the discounted rewards of the latest episode
    Entropy is also taken into account. Each new episode diminishes its importance by run_params.entropy_decay,
    such that the agent will explore at the beginning and tend to explore less and less over time. The agent is
    trained once on all the transitions of the episode (instead of training many times over mini-batches).
    """
    training_info.compute_discounted_rewards()

    # Compute the loss of the policy at each time step
    policy_losses = []
    for log_prob, discounted_reward, entropy in zip(
            training_info.log_probs, training_info.discounted_rewards,
            training_info.entropies):
        entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay**training_info.episode_number
        policy_losses.append(-(log_prob + entropy_coeff * entropy) *
                             discounted_reward)

    # Optimize the policy
    optimizer.zero_grad()
    total_policy_loss = torch.cat(policy_losses).sum()
    total_policy_loss.backward()
    optimizer.step()

    # Reset the state of the episode
    training_info.reset()
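All three examples start by calling training_info.compute_discounted_rewards(). The snippet below is a minimal, self-contained sketch of that computation, assuming it is the standard reward-to-go recursion G_t = r_t + gamma * G_{t+1}; the standalone function name and the gamma default are illustrative and do not come from the examples.

# Minimal sketch of the reward-to-go computation assumed above (illustrative; the real
# implementation lives in TrainingInfo.compute_discounted_rewards).
def compute_discounted_rewards(rewards, gamma=0.99):
    """ Returns G_t = r_t + gamma * G_{t+1} for every time step of the episode. """
    discounted_rewards = []
    running_return = 0.0
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        discounted_rewards.insert(0, running_return)
    return discounted_rewards

# Example: compute_discounted_rewards([1.0, 1.0, 1.0], gamma=0.9) -> [2.71, 1.9, 1.0]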
Example #2
import torch
import torch.nn.functional as F
from torch.optim import Optimizer

# TrainingInfo is a helper class defined elsewhere in the project.


def train_policy_on_episode(optimizer: Optimizer, training_info: TrainingInfo,
                            episode_number: int):
    """ Trains both the actor and the critic using all transitions of the latest episode. The actor's loss is the MSE
     between V(state) and reward + gamma * V(next state), where V indicates the actor's value function.
     The actor / policy is trained by maximizing the log probability * td-error, and an entropy term is
     added to encourage exploration. The entropy is decayed at new each episode by the run_params.entropy_decay
     coefficient.
    """
    training_info.compute_discounted_rewards()

    # Compute the loss of the policy and the critic at each time step
    policy_losses = []  # Policy errors
    value_losses = []  # Critic errors
    for log_prob, discounted_reward, state_value, entropy in zip(
            training_info.log_probs, training_info.discounted_rewards,
            training_info.state_values, training_info.entropies):
        advantage = discounted_reward - state_value.item()
        policy_losses.append(-(log_prob + 0.99**episode_number * entropy) *
                             advantage)
        value_losses.append(
            F.smooth_l1_loss(state_value.squeeze(0),
                             torch.tensor([discounted_reward])))

    # Optimize the actor and the critic jointly
    optimizer.zero_grad()
    total_loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    total_loss.backward()
    optimizer.step()

    # Reset the state of the episode
    training_info.reset()
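The function above consumes per-step log probabilities, state values and entropies that must have been recorded while the episode was played. The rollout loop below is a sketch of how they could be collected; the combined actor-critic network, the old-style Gym environment and the training_info.rewards attribute are assumptions, not part of the example.

import torch
from torch.distributions import Categorical

def run_episode(env, actor_critic, training_info):
    """ Hypothetical rollout loop that records the quantities train_policy_on_episode zips over.
    Assumes a discrete-action actor-critic network returning (logits, state_value) and an
    old-style Gym environment whose step() returns a 4-tuple. """
    state, done = env.reset(), False
    while not done:
        logits, state_value = actor_critic(torch.as_tensor(state, dtype=torch.float32))
        distribution = Categorical(logits=logits)
        action = distribution.sample()

        # Store exactly the per-step quantities used to build the losses above
        training_info.log_probs.append(distribution.log_prob(action))
        training_info.state_values.append(state_value)
        training_info.entropies.append(distribution.entropy())

        state, reward, done, _ = env.step(action.item())
        training_info.rewards.append(reward)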
Example #3
from torch.optim import Optimizer

# SimplePolicyContinuous, TrainingInfo, RunParams and train_batch are defined elsewhere in the project.


def train_policy_batches(policy: SimplePolicyContinuous, optimizer: Optimizer,
                         training_info: TrainingInfo, run_params: RunParams):
    """ Trains the policy using the policy gradient method, given the discounted rewards of the latest episode
    Entropy is also taken into account. Each new episode diminishes its importance by run_params.entropy_decay,
    such that the agent will explore at the beginning and tend to explore less and less over time. The agent is
    trained many times over mini-batches of transitions of the episode (instead of being trained once on all
    transitions)"""
    training_info.compute_discounted_rewards()

    for (states, actions, discounted_rewards) in training_info.get_batches(
            run_params.batch_size):
        train_batch(policy, states, actions, discounted_rewards, optimizer,
                    training_info.episode_number, run_params)

    training_info.reset()
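train_batch and training_info.get_batches are defined elsewhere in the project. As a rough idea of what the per-mini-batch update could look like, here is a hypothetical sketch that mirrors the per-step loss of Example #1; the signature is taken from the call site above, but the body (re-evaluating the action distribution on the batch) is an assumption.

import torch
from torch.optim import Optimizer

def train_batch(policy, states, actions, discounted_rewards, optimizer: Optimizer,
                episode_number, run_params):
    """ Hypothetical mini-batch update: the same entropy-regularized policy gradient loss as in
    Example #1, evaluated on a batch of transitions instead of step by step. """
    # Re-evaluate the action distribution for the batched states (assumes the policy exposes a
    # get_distribution helper and that log_probs, entropies and discounted_rewards all have
    # shape (batch,); the real interface may differ).
    distribution = policy.get_distribution(states)
    log_probs = distribution.log_prob(actions)
    entropies = distribution.entropy()

    entropy_coeff = run_params.entropy_coeff * run_params.entropy_decay ** episode_number
    loss = -((log_probs + entropy_coeff * entropies) * discounted_rewards).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()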