Example #1
        # v(s_t+1) player1
        _, next_value_p1 = agent1(rollout1.states[-1])
        next_value_p1 = next_value_p1.data

        # v(s_t+1) player2
        _, next_value_p2 = agent2(rollout2.states[-1])
        next_value_p2 = next_value_p2.data

        # n-step returns player1
        returns_p1 = rollout1.compute_returns(next_value_p1, gamma)

        # n-step returns player2
        returns_p2 = rollout2.compute_returns(next_value_p2, gamma)

        # eval actions player1
        logit_p1, action_log_probs_p1, values_p1, entropy_p1 = agent1.evaluate_actions(
            rollout1.states[:-1].view(-1, *state_shape),
            rollout1.actions.view(-1, 1)
        )

        # eval actions player2
        logit_p2, action_log_probs_p2, values_p2, entropy_p2 = agent2.evaluate_actions(
            rollout2.states[:-1].view(-1, *state_shape),
            rollout2.actions.view(-1, 1)
        )

        # advantages player1: A_t = R_t (n-step return) - V(s_t)
        values_p1 = values_p1.view(num_steps, num_envs, 1)
        action_log_probs_p1 = action_log_probs_p1.view(num_steps, num_envs, 1)
        advantages_p1 = returns_p1 - values_p1

        # critic loss: squared error against the n-step returns;
        # actor loss: policy gradient with the advantage detached (treated as a constant weight)
        value_loss_p1 = advantages_p1.pow(2).mean()
        action_loss_p1 = -(advantages_p1.data * action_log_probs_p1).mean()
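
Both agents here are assumed to expose the same two-method interface: calling the module returns (logits, value), and evaluate_actions returns (logits, action log-probs, values, entropy). Neither class is shown on this page; the sketch below is a minimal assumed implementation using a flat observation vector and a small MLP body (the real agents most likely use a CNN), so every name and size in it is illustrative rather than the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorCritic(nn.Module):
    """Minimal actor-critic matching the interface used in the snippets (assumed)."""

    def __init__(self, num_inputs, num_actions, hidden=256):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(num_inputs, hidden), nn.ReLU())
        self.policy_head = nn.Linear(hidden, num_actions)  # action logits
        self.value_head = nn.Linear(hidden, 1)             # state value V(s)

    def forward(self, state):
        x = self.body(state)
        return self.policy_head(x), self.value_head(x)

    def evaluate_actions(self, states, actions):
        # actions: LongTensor of shape (batch, 1)
        logit, value = self(states)
        log_probs = F.log_softmax(logit, dim=1)
        probs = F.softmax(logit, dim=1)
        action_log_probs = log_probs.gather(1, actions)    # log pi(a_t | s_t)
        entropy = -(probs * log_probs).sum(1).mean()       # mean policy entropy
        return logit, action_log_probs, value, entropy
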
Example #2
            # episode bookkeeping: for environments whose episode just ended
            # (mask == 0), copy the accumulated return into final_rewards and
            # reset the running accumulator
            final_rewards *= finished_masks
            final_rewards += (1 - finished_masks) * episode_rewards
            episode_rewards *= finished_masks

            finished_masks = make_cuda(finished_masks)

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, finished_masks)
        
        # bootstrap value V(s_T) for the last state of the rollout
        _, next_value = actor_critic(rollout.states[-1])
        next_value = next_value.data

        # n-step discounted returns, bootstrapped from V(s_T)
        returns = rollout.compute_returns(next_value, gamma)
        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            rollout.states[:-1].view(-1, *state_shape),
            rollout.actions.view(-1, 1)
        )

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)        
        advantages = returns - values

        value_loss = advantages.pow(2).mean()        
        action_loss = -(advantages.data * action_log_probs).mean()

        optimizer.zero_grad()        
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()        
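
Every example on this page also relies on a rollout buffer exposing states, actions, insert(...) and compute_returns(...). That class is not shown either; the sketch below is a minimal assumed implementation of the usual A2C n-step return computation, with rewards and masks stored shaped (num_envs, 1) and masks equal to 0 where an episode ended. Field names and shapes are guesses based on how the buffer is used above, not the original code.

import torch

class RolloutStorage:
    """Minimal rollout buffer matching the usage above (assumed)."""

    def __init__(self, num_steps, num_envs, state_shape):
        self.states = torch.zeros(num_steps + 1, num_envs, *state_shape)
        self.actions = torch.zeros(num_steps, num_envs, 1, dtype=torch.long)
        self.rewards = torch.zeros(num_steps, num_envs, 1)
        self.masks = torch.ones(num_steps + 1, num_envs, 1)  # 0 where the episode ended

    def insert(self, step, state, action, reward, mask):
        self.states[step + 1].copy_(state)
        self.actions[step].copy_(action)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)

    def compute_returns(self, next_value, gamma):
        # n-step discounted returns, bootstrapped from V(s_T) and cut at episode ends
        returns = torch.zeros_like(self.masks)
        returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            returns[step] = self.rewards[step] + gamma * returns[step + 1] * self.masks[step + 1]
        return returns[:-1]
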
Example #3

            current_state = next_state
            rollout.insert(step, current_state, action.data, reward, masks)

        with torch.no_grad():
            _, next_value = ei_i2a(rollout.states[-1])
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = ei_i2a.evaluate_actions(
            rollout.states[:-1].view(-1, *state_shape),
            rollout.actions.view(-1, 1))

        distil_logit, _, _, _ = distill_policy.evaluate_actions(
            rollout.states[:-1].view(-1, *state_shape),
            rollout.actions.view(-1, 1))

        # policy distillation: cross-entropy between the (detached) I2A policy
        # and the distilled rollout policy, weighted by 0.01
        distil_loss = -0.01 * (F.softmax(logit, dim=1).detach() * F.log_softmax(
            distil_logit, dim=1)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = returns - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.data * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
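
The 0.01-weighted term above is the I2A policy-distillation loss: a cross-entropy between the detached imagination-augmented policy (the teacher) and the distilled rollout policy (the student), presumably optimised in a separate step that the excerpt does not show. As a sanity check, the self-contained toy below (teacher/student logits and SGD settings are made up for the demo) shows that minimising this cross-entropy pulls the student distribution onto the teacher's.

import torch
import torch.nn.functional as F

teacher_logit = torch.tensor([[2.0, 0.5, -1.0]])          # fixed "teacher" policy logits
student_logit = torch.zeros(1, 3, requires_grad=True)     # "student" logits to be trained
opt = torch.optim.SGD([student_logit], lr=1.0)

for _ in range(200):
    # same form as distil_loss above: -(softmax(teacher) * log_softmax(student)).sum()
    distil_loss = -(F.softmax(teacher_logit, dim=1).detach()
                    * F.log_softmax(student_logit, dim=1)).sum(1).mean()
    opt.zero_grad()
    distil_loss.backward()
    opt.step()

print(F.softmax(student_logit, dim=1))   # close to softmax(teacher_logit)
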
Example #4

            masks = masks.cuda()

        current_state = torch.FloatTensor(np.float32(next_state))
        rollout.insert(step, current_state, action.data, reward, masks)

    # legacy (pre-0.4) PyTorch API: Variable and volatile=True have since been
    # superseded by plain tensors and torch.no_grad()
    _, next_value = actor_critic(Variable(rollout.states[-1], volatile=True))
    next_value = next_value.data

    returns = rollout.compute_returns(next_value, gamma)

    logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
        Variable(rollout.states[:-1]).view(-1, *state_shape),
        Variable(rollout.actions).view(-1, 1))

    distil_logit, _, _, _ = distil_policy.evaluate_actions(
        Variable(rollout.states[:-1]).view(-1, *state_shape),
        Variable(rollout.actions).view(-1, 1))

    # policy distillation: cross-entropy between the (detached) actor-critic policy
    # and the distilled rollout policy, weighted by 0.01
    distil_loss = -0.01 * (F.softmax(logit, dim=1).detach() *
                           F.log_softmax(distil_logit, dim=1)).sum(1).mean()

    values = values.view(num_steps, num_envs, 1)
    action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
    advantages = Variable(returns) - values

    value_loss = advantages.pow(2).mean()
    action_loss = -(Variable(advantages.data) * action_log_probs).mean()

    optimizer.zero_grad()
    loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
    loss.backward()
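
This last excerpt stops right after loss.backward(); in Example #2 the update is finished by clipping the gradient norm and stepping the optimiser, and the same tail presumably follows here. The self-contained snippet below illustrates that clip-then-step pattern; the linear model, RMSprop optimiser and 0.5 clipping norm are arbitrary stand-ins for the demo, not values taken from the excerpt.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)                                       # stand-in for the actor-critic
optimizer = torch.optim.RMSprop(model.parameters(), lr=7e-4)
max_grad_norm = 0.5

loss = model(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
# rescale gradients so their global L2 norm does not exceed max_grad_norm, then update
nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
optimizer.step()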