# Shared imports for the examples below. Project-local names used throughout
# (TanhGaussianActor, GaussianActor, DoubleQCritic, DoubleVCritic, SAC,
# create_batch, create_random_space, torchify, flatten) come from the
# surrounding project; their import paths are not shown in these snippets.
import copy

import gym
import numpy as np
import torch
from gym.spaces import Box


# Example 1
def test_no_nan():
    """Test for no nans in all parameters.

    The main takeaway from this test is that you must set the learning
    rates low or else the parameters will tend to nan.

    """

    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)
    for t in range(200):
        print("iteration", t)
        s.update(batch, t)
        for key, v in a.state_dict().items():
            print("actor", key)
            assert not torch.isnan(v).any()
        for key, v in c.state_dict().items():
            print("critic", key)
            assert not torch.isnan(v).any()
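
# The tests in this listing rely on a create_batch helper that is not shown.
# A minimal sketch, assuming SAC.update consumes a dict of batched tensors
# keyed like a replay-buffer sample (the key names and shapes here are
# assumptions, not the project's actual fixture):
def create_batch(env, device="cuda", batch_size=100):
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    return {
        "obs": torch.rand((batch_size, obs_dim), device=device),
        "action": torch.rand((batch_size, act_dim), device=device),
        "reward": torch.rand((batch_size, 1), device=device),
        "next_obs": torch.rand((batch_size, obs_dim), device=device),
        "not_done": torch.ones((batch_size, 1), device=device),
    }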
# Example 2
def test_random_space():
    for _ in range(100):
        obs_spec = create_random_space()
        act_spec = create_random_space()
        print(obs_spec)
        print(act_spec)
        c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
        c2 = DoubleVCritic(obs_spec, [60, 50])
        obs = torchify(obs_spec.sample())
        act = torchify(act_spec.sample())
        c1.forward(obs, act)
        c2.forward(obs)
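
# create_random_space and torchify are project helpers that are not shown.
# A minimal sketch of torchify, assuming it converts a (possibly nested)
# space sample into float tensors with a leading batch dimension:
def torchify(x):
    if isinstance(x, dict):
        return {k: torchify(v) for k, v in x.items()}
    if isinstance(x, tuple):
        return tuple(torchify(v) for v in x)
    return torch.as_tensor(np.asarray(x), dtype=torch.float32).unsqueeze(0)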
# Example 3
def test_integration():
    obs_spec = Box(low=np.zeros(10, dtype=np.float32),
                   high=np.ones(10, dtype=np.float32))
    act_spec = Box(low=np.zeros(3, dtype=np.float32),
                   high=np.ones(3, dtype=np.float32))
    c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
    c2 = DoubleVCritic(obs_spec, [60, 50])
    obs = torch.rand((100, 10))
    act = torch.rand((100, 3))
    q = c1.forward(obs, act)
    assert q.shape == (100, 1)
    v = c2.forward(obs)
    assert v.shape == (100, 1)
# Example 4
def test_critic_target_update():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(
        actor=a,
        critic=c,
        _device="cuda",
        _act_dim=len(env.action_space.low),
        _critic_target_update_frequency=200,
    )

    batch = create_batch(env)
    # clone: state_dict() returns live references, so a plain copy would
    # silently track the very updates this test is trying to detect
    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    for t in range(100):
        s.update(batch, t + 1)
    cp_after = s.critic_target.state_dict()

    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert torch.all(v == v2)

    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    s.update(batch, 1000)
    cp_after = s.critic_target.state_dict()

    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert not torch.all(v == v2)
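
# A sketch of the soft (Polyak) target update the test above exercises,
# assuming the usual SAC form gated on the step counter; everything except
# the attributes set in SAC.__init__ below is an assumption:
def soft_update_sketch(sac, step):
    if step % sac.critic_target_update_frequency == 0:
        with torch.no_grad():
            for p, p_targ in zip(sac.critic.parameters(),
                                 sac.critic_target.parameters()):
                p_targ.mul_(1.0 - sac.critic_tau).add_(p, alpha=sac.critic_tau)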
# Example 5: SAC.__init__ (shown indented, as in the class body)
    def __init__(
        self,
        actor: GaussianActor,
        critic: DoubleQCritic,
        _device: str,
        _act_dim: int,
        _critic_tau: float = 5e-3,
        _discount: float = 0.99,
        _init_temperature: float = 0.1,
        _learnable_temperature: bool = True,
        _actor_update_frequency: int = 1,
        _critic_target_update_frequency: int = 1,
        _optimizer_type: str = "adam",
        _alpha_lr: float = 3e-4,
        _actor_lr: float = 3e-4,
        _actor_weight_decay: float = 1e-2,
        _critic_lr: float = 3e-4,
    ) -> None:

        super().__init__()

        # set other parameters
        self.device = torch.device(_device)
        self.discount = _discount
        self.critic_tau = _critic_tau
        self.actor_update_frequency = _actor_update_frequency
        self.critic_target_update_frequency = _critic_target_update_frequency
        self.learnable_temperature = _learnable_temperature

        # instantiate actor and critic
        self.actor = actor.to(self.device)
        self.critic = critic.to(self.device)
        self.critic_target = copy.deepcopy(critic)
        self.critic_target.eval()

        # instantiate log alpha
        self.log_alpha = torch.tensor(np.log(_init_temperature)).to(
            self.device)
        self.log_alpha.requires_grad = True
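        # SAC heuristic: target a policy entropy of -|A|, the action dimension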
        self.target_entropy = -_act_dim

        # optimizers
        if _optimizer_type == "adam":
            optimizer = torch.optim.Adam
        elif _optimizer_type == "sgd":
            optimizer = torch.optim.SGD
        else:
            raise NotImplementedError()

        self.actor_optimizer = optimizer(self.actor.parameters(),
                                         lr=_actor_lr,
                                         weight_decay=_actor_weight_decay)
        self.critic_optimizer = optimizer(self.critic.parameters(),
                                          lr=_critic_lr)
        self.log_alpha_optimizer = optimizer([self.log_alpha], lr=_alpha_lr)
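
    # A sketch of the critic update these optimizers drive, following the
    # standard SAC bootstrapped target; the actor's sample() signature and
    # the critic returning both Q heads are assumptions:
    def sketch_update_critic(self, obs, action, reward, next_obs, not_done):
        with torch.no_grad():
            next_action, log_prob = self.actor.sample(next_obs)
            target_q1, target_q2 = self.critic_target(next_obs, next_action)
            target_v = (torch.min(target_q1, target_q2)
                        - self.log_alpha.exp() * log_prob)
            target_q = reward + not_done * self.discount * target_v
        q1, q2 = self.critic(obs, action)
        critic_loss = (torch.nn.functional.mse_loss(q1, target_q)
                       + torch.nn.functional.mse_loss(q2, target_q))
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()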
# Example 6
def test_actor_loss_decrease():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)
    batch = {"obs": batch["obs"]}
    s.update_actor_and_alpha(**batch)
    loss_before = s.log_local_epoch()["actor/loss"]
    for _ in range(200):
        s.update_actor_and_alpha(**batch)
    loss_after = s.log_local_epoch()["actor/loss"]
    assert loss_after < loss_before + 0.2
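
# A sketch of the actor and temperature objectives this test drives down,
# following the standard SAC formulation; the actor's sample() signature
# is an assumption:
def sketch_actor_and_alpha_losses(sac, obs):
    action, log_prob = sac.actor.sample(obs)
    q1, q2 = sac.critic(obs, action)
    actor_loss = (sac.log_alpha.exp().detach() * log_prob
                  - torch.min(q1, q2)).mean()
    alpha_loss = (sac.log_alpha.exp()
                  * (-log_prob - sac.target_entropy).detach()).mean()
    return actor_loss, alpha_loss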
# Example 7
def test_integration():
    env = gym.make("InvertedPendulum-v2")

    for device in "cpu", "cuda":
        a = TanhGaussianActor(env.observation_space, env.action_space,
                              [256, 256])
        c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
        s = SAC(actor=a,
                critic=c,
                _device=device,
                _act_dim=len(env.action_space.low))

        print(flatten(s.log_hyperparams()).keys())

        batch = create_batch(env, device)
        for t in range(10):
            s.update(batch, t)

        print(flatten(s.log_epoch()).keys())
# Example 8
def test_critic_value_increase():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)

    s.update_critic(**batch)
    q1_before = s.log_local_epoch()["critic/q1"].mean()
    q2_before = s.log_local_epoch()["critic/q2"].mean()
    for _ in range(200):
        s.update_critic(**batch)
    q1_after = s.log_local_epoch()["critic/q1"].mean()
    q2_after = s.log_local_epoch()["critic/q2"].mean()
    assert q1_after > q1_before - 0.2
    assert q2_after > q2_before - 0.2