Example #1
            # Actor-critic update over the collected rollout.
            solver.zero_grad()
            obs_l = torch.FloatTensor(obs_l).to(device)
            probs_l = torch.stack(probs_l, dim=0)
            acts_l = torch.stack(acts_l, dim=0)
            # Probability of the action actually taken at each step.
            probs_a = probs_l.gather(2, acts_l).squeeze(-1)
            next_obs_in = torch.FloatTensor(s_gotten).to(device)
            next_v = critic(next_obs_in).squeeze(-1)
            values = critic(obs_l).squeeze(-1)
            # TD residual: bootstrapped target minus the current value estimate.
            td_res = compute_td(next_v.detach(), rews_l, dones_l,
                                gamma) - values

            critic_loss = td_res.pow(2).sum(dim=0)
            # Policy-gradient objective; the residual is detached so only the
            # actor receives this gradient.
            performance = probs_a.log() * td_res.detach()
            actor_loss = -performance.sum(dim=0)
            # Negative entropy of the policy; minimizing it rewards exploration.
            neg_entropy = (probs_l * probs_l.log()).sum(dim=[0, 2])

            # Mask out rejected samples and average over the accepted ones.
            critic_loss = (critic_loss * accept_sample).sum() / num_sample
            actor_loss = (actor_loss * accept_sample).sum() / num_sample
            neg_entropy = (neg_entropy * accept_sample).sum() / num_sample
            loss = critic_loss + actor_loss + 0.5 * neg_entropy
            loss.backward()
            solver.step()

        frame_count += 1
        # Roughly every 100 frames, print the mean of the last 100 episode durations.
        if frame_count % 101 == 100:
            print(np.array(duration)[-100:].mean())

    envs.close()
    render_simulation(env_name, actor, device)
    plot_duration(duration)
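
The fragment above relies on a compute_td helper that is not shown. A minimal sketch of such a helper, assuming rews_l and dones_l are lists of per-step float tensors of shape (num_envs,) and next_v is the critic's value for the state that follows the rollout:

import torch


def compute_td(next_value, rewards, dones, gamma):
    # Hypothetical n-step TD target: discounted rewards bootstrapped from
    # next_value, with the bootstrap cut off wherever an episode ended.
    target = next_value
    targets = []
    for reward, done in zip(reversed(rewards), reversed(dones)):
        target = reward + gamma * target * (1 - done)
        targets.append(target)
    targets.reverse()
    # Shape (num_steps, num_envs), matching `values` in the update above.
    return torch.stack(targets, dim=0)
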
Example #2
import numpy as np
import torch
import torch.optim as optim


def main():
    # make_env, num_envs, hidden_size, hd2_size, num_steps, device, ActorCritic,
    # compute_returns, test_env and SubprocVecEnv are assumed to be defined or
    # imported elsewhere in the module.
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    # Shared actor-critic network and its optimizer.
    model = ActorCritic(num_inputs, num_outputs, hidden_size,
                        hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        # Per-rollout storage for the policy-gradient update.
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # Collect num_steps transitions from every parallel environment.
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            # A mask of 0 cuts the bootstrapped return at episode boundaries.
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        # Bootstrap from the value of the state that follows the rollout and
        # compute discounted returns for every step.
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        # Policy-gradient term, value-regression term and entropy bonus.
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        print(f'\rframe: {frame_idx}\t loss: {loss.item():.4f}', end='')
        if frame_idx % 100 == 0:
            # Periodically evaluate the current policy on 10 test episodes.
            rewards, scores = map(
                list, zip(*(test_env(model, False) for _ in range(10))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(
                f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t '
                f'avg_scores: {avg_scores:.2f}\t loss: {loss.item():.4f}'
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Render 10 evaluation episodes with the trained model.
    for _ in range(10):
        test_env(model, True)
    envs.close()
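
Neither compute_returns nor make_env is shown above. A minimal sketch under the assumption that rewards and masks are lists of (num_envs, 1) tensors and that SubprocVecEnv expects a list of environment-building thunks (as in the OpenAI baselines API); the 'CartPole-v1' default is purely illustrative:

import gym


def compute_returns(next_value, rewards, masks, gamma=0.99):
    # Hypothetical helper: discounted returns bootstrapped from next_value,
    # with the bootstrap zeroed by the mask at episode boundaries.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns


def make_env(env_name='CartPole-v1'):
    # Hypothetical thunk factory; SubprocVecEnv runs each returned callable
    # in its own worker process.
    def _thunk():
        return gym.make(env_name)
    return _thunk
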