import torch

# NOTE: DOUBLE and device are helpers from the surrounding project (a
# double-precision tensor constructor and the torch device); they are
# assumed to be defined elsewhere.


def reinforce_step(policy_net,
                   optimizer_policy,
                   states,
                   actions,
                   rewards,
                   masks,
                   gamma,
                   eps=1e-6):
    """calculate cumulative reward"""
    cum_rewards = DOUBLE(rewards.size(0), 1).to(device)
    pre_value = 0
    for i in reversed(range(rewards.size(0))):
        pre_value = gamma * masks[i] * pre_value + rewards[i, 0]
        cum_rewards[i, 0] = pre_value

    # normalize cumulative rewards
    cum_rewards = (cum_rewards - cum_rewards.mean()) / (cum_rewards.std() +
                                                        eps)
    """update policy"""
    log_probs = policy_net.get_log_prob(states, actions)
    policy_loss = -(log_probs * cum_rewards).mean()

    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()

    return policy_loss
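
# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original
# snippet). It assumes a policy object exposing get_log_prob(states, actions)
# and provides stand-in definitions for the project's DOUBLE / device helpers.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DOUBLE = torch.DoubleTensor  # stand-in for the project's DOUBLE helper


class ToyPolicy(nn.Module):
    """Tiny discrete policy illustrating the interface reinforce_step expects."""

    def __init__(self, state_dim, n_actions):
        super().__init__()
        self.logits = nn.Linear(state_dim, n_actions).double()

    def get_log_prob(self, states, actions):
        dist = torch.distributions.Categorical(logits=self.logits(states))
        return dist.log_prob(actions.squeeze(-1)).unsqueeze(-1)


if __name__ == "__main__":
    T, state_dim, n_actions = 16, 4, 2
    policy = ToyPolicy(state_dim, n_actions).to(device)
    opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

    # fake rollout: one row per time step, masks are 1 inside an episode
    states = torch.randn(T, state_dim, dtype=torch.double, device=device)
    actions = torch.randint(0, n_actions, (T, 1), device=device)
    rewards = torch.randn(T, 1, dtype=torch.double, device=device)
    masks = torch.ones(T, 1, dtype=torch.double, device=device)
    masks[-1] = 0  # terminal step

    loss = reinforce_step(policy, opt, states, actions, rewards, masks, gamma=0.99)
    print("policy loss:", loss.item())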
Example 2
def estimate_advantages(rewards, masks, values, gamma, tau):
    """Generalized Advantage Estimation (GAE): compute advantages and value
    targets (returns) for a rollout. masks[i] is 0 on terminal steps, gamma
    is the discount factor and tau is the GAE smoothing parameter (lambda)."""
    deltas = DOUBLE(rewards.size(0), 1).to(device)
    advantages = DOUBLE(rewards.size(0), 1).to(device)

    prev_value = 0
    prev_advantage = 0
    # iterate backwards: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
    # A_t = delta_t + gamma * tau * A_{t+1}; masks cut both across episodes
    for i in reversed(range(rewards.size(0))):
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]

    # returns serve as value-function regression targets; normalizing the
    # advantages stabilizes the subsequent policy update
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

    return advantages, returns
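
# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original
# snippet). It feeds estimate_advantages a fake rollout; torch, DOUBLE and
# device are assumed to be available as in the sketch above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    T = 8
    rewards = torch.randn(T, 1, dtype=torch.double, device=device)
    values = torch.randn(T, 1, dtype=torch.double, device=device)
    masks = torch.ones(T, 1, dtype=torch.double, device=device)
    masks[-1] = 0  # terminal step: no bootstrapping past the episode end

    advantages, returns = estimate_advantages(rewards, masks, values,
                                              gamma=0.99, tau=0.95)
    print(advantages.shape, returns.shape)  # both torch.Size([8, 1])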