Code Example #1
import gym
import torch

# TanhGaussianActor, DoubleQCritic, SAC, and create_batch are assumed to be
# provided by the project under test.
def test_no_nan():
    """Test for no nans in all parameters.

    The main takeaway from this test is that you must set the learning
    rates low or else the parameters will tend to nan.

    """

    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)
    for t in range(200):
        print("iteration", t)
        s.update(batch, t)
        for key, v in a.state_dict().items():
            print("actor", key)
            assert not torch.isnan(v).any()
        for key, v in c.state_dict().items():
            print("critic", key)
            assert not torch.isnan(v).any()
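The per-parameter NaN scan above is a pattern worth factoring out. Below is a minimal sketch of such a helper, assuming only PyTorch; the name `assert_no_nan` is illustrative and not part of the project:

import torch
import torch.nn as nn


def assert_no_nan(module: nn.Module, name: str) -> None:
    """Assert that no parameter or buffer of `module` contains a NaN."""
    for key, v in module.state_dict().items():
        assert not torch.isnan(v).any(), f"{name}.{key} contains NaN"

With such a helper, each loop body in the test collapses to a single call, e.g. assert_no_nan(a, "actor") and assert_no_nan(c, "critic").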
Code Example #2
def test_random_space():
    for _ in range(100):
        obs_spec = create_random_space()
        act_spec = create_random_space()
        print(obs_spec)
        print(act_spec)
        c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
        c2 = DoubleVCritic(obs_spec, [60, 50])
        obs = torchify(obs_spec.sample())
        act = torchify(act_spec.sample())
        c1(obs, act)
        c2(obs)
Code Example #3
import numpy as np
import torch
from gym.spaces import Box

def test_integration():
    obs_spec = Box(low=np.zeros(10, dtype=np.float32),
                   high=np.ones(10, dtype=np.float32))
    act_spec = Box(low=np.zeros(3, dtype=np.float32),
                   high=np.ones(3, dtype=np.float32))
    c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
    c2 = DoubleVCritic(obs_spec, [60, 50])
    obs = torch.rand((100, 10))
    act = torch.rand((100, 3))
    q = c1(obs, act)
    assert q.shape == (100, 1)
    v = c2(obs)
    assert v.shape == (100, 1)
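The shape contract asserted here, a (batch, 1) value from batched inputs, matches what a conventional double Q-critic produces. The sketch below illustrates the idea in plain PyTorch, under the assumption that forward returns the elementwise minimum of two Q-heads (one common convention); `MinimalDoubleQCritic` is hypothetical, not the project's DoubleQCritic:

import torch
import torch.nn as nn


def mlp(sizes):
    """Build an MLP with ReLU activations between linear layers."""
    layers = []
    for i in range(len(sizes) - 2):
        layers += [nn.Linear(sizes[i], sizes[i + 1]), nn.ReLU()]
    layers.append(nn.Linear(sizes[-2], sizes[-1]))
    return nn.Sequential(*layers)


class MinimalDoubleQCritic(nn.Module):
    """Two independent Q-networks over concatenated (obs, act)."""

    def __init__(self, obs_dim: int, act_dim: int, hidden: list):
        super().__init__()
        sizes = [obs_dim + act_dim] + hidden + [1]
        self.q1 = mlp(sizes)
        self.q2 = mlp(sizes)

    def forward(self, obs, act):
        x = torch.cat([obs, act], dim=-1)
        # taking the minimum of the two heads is the double-Q trick
        # used to curb value overestimation
        return torch.min(self.q1(x), self.q2(x))

MinimalDoubleQCritic(10, 3, [60, 50]) applied to the (100, 10) and (100, 3) batches above yields the (100, 1) output the assertions expect.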
Code Example #4
import gym
import torch

def test_critic_target_update():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(
        actor=a,
        critic=c,
        _device="cuda",
        _act_dim=len(env.action_space.low),
        _critic_target_update_frequency=200,
    )

    batch = create_batch(env)
    # clone: state_dict() returns references to the live tensors, so take a
    # real snapshot before updating
    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    # steps 1..100 stay below the update frequency of 200, so the target
    # parameters must not move
    for t in range(100):
        s.update(batch, t + 1)
    cp_after = s.critic_target.state_dict()

    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert torch.all(v == v2)

    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    s.update(batch, 1000)
    cp_after = s.critic_target.state_dict()

    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert not torch.all(v == v2)
Code Example #5
    # Constructor of the SAC agent; `copy`, `numpy as np`, and `torch` are
    # assumed to be imported at module level.
    def __init__(
        self,
        actor: GaussianActor,
        critic: DoubleQCritic,
        _device: str,
        _act_dim: int,
        _critic_tau: float = 5e-3,
        _discount: float = 0.99,
        _init_temperature: float = 0.1,
        _learnable_temperature: bool = True,
        _actor_update_frequency: int = 1,
        _critic_target_update_frequency: int = 1,
        _optimizer_type: str = "adam",
        _alpha_lr: float = 3e-4,
        _actor_lr: float = 3e-4,
        _actor_weight_decay: float = 1e-2,
        _critic_lr: float = 3e-4,
    ) -> None:

        super().__init__()

        # store hyperparameters
        self.device = torch.device(_device)
        self.discount = _discount
        self.critic_tau = _critic_tau
        self.actor_update_frequency = _actor_update_frequency
        self.critic_target_update_frequency = _critic_target_update_frequency
        self.learnable_temperature = _learnable_temperature

        # instantiate actor and critic
        self.actor = actor.to(self.device)
        self.critic = critic.to(self.device)
        self.critic_target = copy.deepcopy(critic)
        self.critic_target.eval()

        # instantiate log alpha as a learnable scalar
        self.log_alpha = torch.tensor(np.log(_init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        # standard SAC heuristic: target entropy of -|A| (negative action dim)
        self.target_entropy = -_act_dim

        # optimizers
        if _optimizer_type == "adam":
            optimizer = torch.optim.Adam
        elif _optimizer_type == "sgd":
            optimizer = torch.optim.SGD
        else:
            raise NotImplementedError(
                f"unknown optimizer type: {_optimizer_type}")

        self.actor_optimizer = optimizer(self.actor.parameters(),
                                         lr=_actor_lr,
                                         weight_decay=_actor_weight_decay)
        self.critic_optimizer = optimizer(self.critic.parameters(),
                                          lr=_critic_lr)
        self.log_alpha_optimizer = optimizer([self.log_alpha], lr=_alpha_lr)
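`_critic_tau` together with `_critic_target_update_frequency` implies a Polyak-averaged target network, which test_critic_target_update above exercises. A minimal sketch of that update step, assuming only PyTorch; `soft_update` is an illustrative name, not the project's API:

import torch
import torch.nn as nn


@torch.no_grad()
def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    """Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target."""
    for tp, sp in zip(target.parameters(), source.parameters()):
        tp.mul_(1.0 - tau).add_(sp, alpha=tau)

Calling this every critic_target_update_frequency steps with tau = critic_tau reproduces the behavior the frequency test checks: the target is frozen between scheduled steps and moves only when one fires.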
Code Example #6
import gym

def test_actor_loss_decrease():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)
    batch = {"obs": batch["obs"]}
    s.update_actor_and_alpha(**batch)
    loss_before = s.log_local_epoch()["actor/loss"]
    for _ in range(200):
        s.update_actor_and_alpha(**batch)
    loss_after = s.log_local_epoch()["actor/loss"]
    # allow a small tolerance: the actor loss is stochastic in the sampled actions
    assert loss_after < loss_before + 0.2
Code Example #7
import gym

def test_integration():
    env = gym.make("InvertedPendulum-v2")

    for device in ("cpu", "cuda"):
        a = TanhGaussianActor(env.observation_space, env.action_space,
                              [256, 256])
        c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
        s = SAC(actor=a,
                critic=c,
                _device=device,
                _act_dim=len(env.action_space.low))

        print(flatten(s.log_hyperparams()).keys())

        batch = create_batch(env, device)
        for t in range(10):
            s.update(batch, t)

        print(flatten(s.log_epoch()).keys())
Code Example #8
import gym

def test_critic_value_increase():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a,
            critic=c,
            _device="cuda",
            _act_dim=len(env.action_space.low))

    batch = create_batch(env)

    s.update_critic(**batch)
    q1_before = s.log_local_epoch()["critic/q1"].mean()
    q2_before = s.log_local_epoch()["critic/q2"].mean()
    for _ in range(200):
        s.update_critic(**batch)
    q1_after = s.log_local_epoch()["critic/q1"].mean()
    q2_after = s.log_local_epoch()["critic/q2"].mean()
    assert q1_after > q1_before - 0.2
    assert q2_after > q2_before - 0.2