Example #1
                        # Append the discounted reward from imagined step j of the
                        # model-based rollout, then advance the imagined state.
                        t.rewards = torch.cat(
                            (t.rewards,
                             torch.Tensor([gamma**(j + 1) * reward]).reshape(1, -1)))
                        imagine_state = imagine_next_state

                    # Bootstrap with the target critic's value of the final imagined
                    # state, discounted beyond the imagination horizon.
                    imagine_action = actor.target_net(imagine_state).reshape(1, -1)
                    bootstrap_Q = (gamma**(params['imagination_steps'] + 1) *
                                   critic.target_net(imagine_state, imagine_action))

                # n-step target: the remaining (already discounted) rewards from each
                # index onwards plus the bootstrapped value of the last imagined state.
                target = torch.stack([
                    t.rewards[i:].sum() + bootstrap_Q
                    for i in range(len(t.rewards))
                ]).reshape(-1, 1)
                current = critic.net(t.states, t.actions)
                critic_loss = critic_loss_fnc(target, current)
                wandb.log(
                    {
                        "value_loss": critic_loss,
                        'step': global_step,
                        'episode': episode
                    },
                    commit=False)
                critic.optimise(critic_loss)

                # Polyak-average the online weights into both target networks.
                critic.soft_target_update()
                actor.soft_target_update()

    wandb.log({"episode_reward": episode_reward, 'episode': episode})
Example #2
            # Entropy-regularised SAC target. The statement computing target_action,
            # target_action_log and target_Q is truncated in this excerpt; the lines
            # below reconstruct the usual pattern (sample the next action, take the
            # minimum of both target critics). The attribute name t.next_states is
            # an assumption.
            target_action, target_action_log = actor.action_selection(t.next_states)
            target_Q = (torch.min(critic1.target_net(t.next_states, target_action),
                                  critic2.target_net(t.next_states, target_action))
                        - alpha * target_action_log)
            target = t.rewards + gamma * (1 - t.terminals) * target_Q
            # Both critics regress towards the same entropy-regularised target.
            current_v1 = critic1.net(t.states, t.actions)
            current_v2 = critic2.net(t.states, t.actions)
            critic_loss1 = critic_loss_fnc(target, current_v1)
            critic_loss2 = critic_loss_fnc(target, current_v2)
            wandb.log(
                {
                    "value_loss": (critic_loss1 + critic_loss2) / 2,
                    'step': global_step,
                    'episode': episode
                },
                commit=False)
            critic1.optimise(critic_loss1)
            critic2.optimise(critic_loss2)
            critic1.soft_target_update()
            critic2.soft_target_update()

            # Policy update: evaluate the reparameterised policy action with the
            # smaller of the two critics (clipped double-Q) and subtract the
            # entropy term.
            policy_action, policy_action_log = actor.action_selection(t.states)
            Q_min = torch.min(critic1.net(t.states, policy_action),
                              critic2.net(t.states, policy_action))
            actor_loss = (Q_min - alpha * policy_action_log).mean()
            wandb.log(
                {
                    "policy_loss": actor_loss,
                    'step': global_step,
                    'episode': episode
                },
                commit=False)
            # actor_loss above is the objective to be maximised; negate it for the
            # gradient descent step.
            actor.optimise(-actor_loss)
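
The SAC update above assumes that actor.action_selection returns a reparameterised action together with its log-probability, which feeds both the entropy-regularised target and the policy loss. A common way to provide this is a tanh-squashed Gaussian head, sketched below; the layer sizes, the log-std clamp range and the fact that action_selection sits directly on the policy module are assumptions, not the code behind this example.

import torch
import torch.nn as nn
from torch.distributions import Normal


class GaussianPolicy(nn.Module):
    """Hypothetical policy with the action_selection interface assumed above."""

    def __init__(self, state_dim: int, action_dim: int, hidden: int = 256):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                  nn.Linear(hidden, hidden), nn.ReLU())
        self.mean = nn.Linear(hidden, action_dim)
        self.log_std = nn.Linear(hidden, action_dim)

    def action_selection(self, states: torch.Tensor):
        h = self.body(states)
        mean, log_std = self.mean(h), self.log_std(h).clamp(-20, 2)
        dist = Normal(mean, log_std.exp())
        pre_tanh = dist.rsample()      # reparameterised sample, keeps gradients
        action = torch.tanh(pre_tanh)  # squash into [-1, 1]
        # Log-probability with the tanh change-of-variables correction.
        log_prob = dist.log_prob(pre_tanh) - torch.log(1 - action.pow(2) + 1e-6)
        return action, log_prob.sum(dim=-1, keepdim=True)

The change-of-variables correction in log_prob is what makes the alpha-weighted entropy terms in the snippet well defined for squashed actions.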