import gym
import numpy as np
import torch
from gym.spaces import Box

# Repo-internal names (TanhGaussianActor, GaussianActor, DoubleQCritic,
# DoubleVCritic, SAC, create_batch, create_random_space, torchify, flatten)
# are imported from elsewhere in the project; their module paths are not
# shown in this excerpt.


def test_no_nan():
    """Test for no NaNs in any parameters.

    The main takeaway from this test is that you must set the learning
    rates low, or else the parameters will tend to NaN.
    """
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a, critic=c, _device="cuda", _act_dim=len(env.action_space.low))
    batch = create_batch(env)
    for t in range(200):
        print("iteration", t)
        s.update(batch, t)
    for key, v in a.state_dict().items():
        print("actor", key)
        assert not torch.any(torch.isnan(v))
    for key, v in c.state_dict().items():
        print("critic", key)
        assert not torch.any(torch.isnan(v))
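# `create_batch` is referenced throughout these tests but defined elsewhere in
# the repo. Below is a minimal sketch of what it plausibly does, under the
# assumption that SAC.update consumes a dict of batched tensors. Only the
# "obs" key is confirmed by the tests; the other key names are guesses, and
# the real helper presumably samples actual environment transitions rather
# than random tensors.
def create_batch_sketch(env, device="cuda", batch_size=256):
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    return {
        "obs": torch.rand((batch_size, obs_dim), device=device),
        "action": torch.rand((batch_size, act_dim), device=device),
        "reward": torch.rand((batch_size, 1), device=device),
        "next_obs": torch.rand((batch_size, obs_dim), device=device),
        "done": torch.zeros((batch_size, 1), device=device),
    }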
def test_random_space():
    for _ in range(100):
        obs_spec = create_random_space()
        act_spec = create_random_space()
        print(obs_spec)
        print(act_spec)
        c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
        c2 = DoubleVCritic(obs_spec, [60, 50])
        obs = torchify(obs_spec.sample())
        act = torchify(act_spec.sample())
        c1.forward(obs, act)
        c2.forward(obs)
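# `torchify` is another repo-internal helper assumed above: it converts a
# sample drawn from a gym space into tensors the critics accept. The sketch
# below handles only the flat-ndarray case and adds a batch dimension; the
# real helper presumably also recurses into composite spaces, if
# create_random_space can produce Dict or Tuple specs.
def torchify_sketch(sample):
    return torch.as_tensor(np.asarray(sample), dtype=torch.float32).unsqueeze(0)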
def test_integration():
    obs_spec = Box(low=np.zeros(10, dtype=np.float32),
                   high=np.ones(10, dtype=np.float32))
    act_spec = Box(low=np.zeros(3, dtype=np.float32),
                   high=np.ones(3, dtype=np.float32))
    c1 = DoubleQCritic(obs_spec, act_spec, [60, 50])
    c2 = DoubleVCritic(obs_spec, [60, 50])
    obs = torch.rand((100, 10))
    act = torch.rand((100, 3))
    q = c1.forward(obs, act)
    assert q.shape == (100, 1)
    v = c2.forward(obs)
    assert v.shape == (100, 1)
def test_critic_target_update():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(
        actor=a,
        critic=c,
        _device="cuda",
        _act_dim=len(env.action_space.low),
        _critic_target_update_frequency=200,
    )
    batch = create_batch(env)

    # Clone the snapshot: state_dict() returns live references, so comparing
    # it against itself later would pass vacuously.
    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    for t in range(100):
        s.update(batch, t + 1)
    cp_after = s.critic_target.state_dict()
    # Fewer steps than the update frequency: the target must be unchanged.
    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert torch.all(v == v2)

    cp_before = {k: v.clone() for k, v in s.critic_target.state_dict().items()}
    s.update(batch, 1000)
    cp_after = s.critic_target.state_dict()
    # A step index divisible by the frequency must trigger the soft update.
    for k, v in cp_before.items():
        v2 = cp_after[k]
        assert not torch.all(v == v2)
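# Hedged sketch of the gating this test exercises. SAC.update's body is not
# shown in this excerpt; the assumption, consistent with the assertions
# above, is that the target critic is refreshed only on steps divisible by
# critic_target_update_frequency. `soft_update_params` is a hypothetical
# helper name; a possible body is sketched after __init__ below.
def update_gating_sketch(s, batch, step):
    s.update_critic(**batch)
    if step % s.actor_update_frequency == 0:
        s.update_actor_and_alpha(obs=batch["obs"])
    if step % s.critic_target_update_frequency == 0:
        soft_update_params(s.critic, s.critic_target, s.critic_tau)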
def __init__(
    self,
    actor: GaussianActor,
    critic: DoubleQCritic,
    _device: str,
    _act_dim: int,
    _critic_tau: float = 5e-3,
    _discount: float = 0.99,
    _init_temperature: float = 0.1,
    _learnable_temperature: bool = True,
    _actor_update_frequency: int = 1,
    _critic_target_update_frequency: int = 1,
    _optimizer_type: str = "adam",
    _alpha_lr: float = 3e-4,
    _actor_lr: float = 3e-4,
    _actor_weight_decay: float = 1e-2,
    _critic_lr: float = 3e-4,
) -> None:
    super().__init__()

    # set other parameters
    self.device = torch.device(_device)
    self.discount = _discount
    self.critic_tau = _critic_tau
    self.actor_update_frequency = _actor_update_frequency
    self.critic_target_update_frequency = _critic_target_update_frequency
    self.learnable_temperature = _learnable_temperature

    # instantiate actor and critic
    self.actor = actor.to(self.device)
    self.critic = critic.to(self.device)
    self.critic_target = copy.deepcopy(critic)
    self.critic_target.eval()

    # instantiate log alpha
    self.log_alpha = torch.tensor(np.log(_init_temperature)).to(self.device)
    self.log_alpha.requires_grad = True
    self.target_entropy = -_act_dim

    # optimizers
    if _optimizer_type == "adam":
        optimizer = torch.optim.Adam
    elif _optimizer_type == "sgd":
        optimizer = torch.optim.SGD
    else:
        raise NotImplementedError()
    self.actor_optimizer = optimizer(self.actor.parameters(),
                                     lr=_actor_lr,
                                     weight_decay=_actor_weight_decay)
    self.critic_optimizer = optimizer(self.critic.parameters(), lr=_critic_lr)
    self.log_alpha_optimizer = optimizer([self.log_alpha], lr=_alpha_lr)
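# Two standard SAC pieces that the hyperparameters above feed, neither of
# which appears in this excerpt. These are hedged sketches, not the repo's
# verbatim code. First, the Polyak averaging that critic_tau drives (the
# conventional body for the hypothetical soft_update_params used earlier):
def soft_update_sketch(net, target_net, tau):
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), target_net.parameters()):
            # Exponential moving average: target <- (1 - tau)*target + tau*net
            p_targ.mul_(1.0 - tau).add_(tau * p)


# Second, the temperature objective that target_entropy = -act_dim anchors
# when learnable_temperature is set (log_prob comes from the policy sample):
#     alpha_loss = (log_alpha.exp() * (-log_prob - target_entropy).detach()).mean()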
def test_actor_loss_decrease():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a, critic=c, _device="cuda", _act_dim=len(env.action_space.low))
    batch = create_batch(env)
    batch = {"obs": batch["obs"]}
    s.update_actor_and_alpha(**batch)
    loss_before = s.log_local_epoch()["actor/loss"]
    for _ in range(200):
        s.update_actor_and_alpha(**batch)
    loss_after = s.log_local_epoch()["actor/loss"]
    # Allow a small tolerance: the loss is noisy but should not increase.
    assert loss_after < loss_before + 0.2
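# Hedged sketch of the objective the test above drives down: the standard SAC
# actor loss, not necessarily the repo's exact update_actor_and_alpha body.
# It assumes the actor returns a distribution with rsample/log_prob and that
# the double-Q critic returns a pair of values.
def actor_loss_sketch(s, obs):
    dist = s.actor(obs)
    action = dist.rsample()  # reparameterized sample keeps gradients flowing
    log_prob = dist.log_prob(action).sum(-1, keepdim=True)
    q1, q2 = s.critic(obs, action)
    # Maximize entropy-regularized value, i.e. minimize alpha*logpi - min(Q1, Q2).
    return (s.log_alpha.exp().detach() * log_prob - torch.min(q1, q2)).mean()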
def test_integration():
    env = gym.make("InvertedPendulum-v2")
    for device in ("cpu", "cuda"):
        a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
        c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
        s = SAC(actor=a, critic=c, _device=device, _act_dim=len(env.action_space.low))
        print(flatten(s.log_hyperparams()).keys())
        batch = create_batch(env, device)
        for t in range(10):
            s.update(batch, t)
        print(flatten(s.log_epoch()).keys())
def test_critic_value_increase():
    env = gym.make("InvertedPendulum-v2")
    a = TanhGaussianActor(env.observation_space, env.action_space, [256, 256])
    c = DoubleQCritic(env.observation_space, env.action_space, [256, 256])
    s = SAC(actor=a, critic=c, _device="cuda", _act_dim=len(env.action_space.low))
    batch = create_batch(env)
    s.update_critic(**batch)
    q1_before = s.log_local_epoch()["critic/q1"].mean()
    q2_before = s.log_local_epoch()["critic/q2"].mean()
    for _ in range(200):
        s.update_critic(**batch)
    q1_after = s.log_local_epoch()["critic/q1"].mean()
    q2_after = s.log_local_epoch()["critic/q2"].mean()
    # Repeated updates on a fixed batch should drive Q up, within noise.
    assert q1_after > q1_before - 0.2
    assert q2_after > q2_before - 0.2
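# Hedged sketch of the Bellman target behind the test above. With the
# standard SAC backup (not necessarily the repo's exact update_critic body),
# nonnegative rewards and a discount near 1 make repeated regression on a
# fixed batch push the Q estimates upward. Tensor names beyond "obs" are
# assumptions about the batch layout.
def critic_target_sketch(s, reward, next_obs, done):
    with torch.no_grad():
        dist = s.actor(next_obs)
        next_action = dist.rsample()
        log_prob = dist.log_prob(next_action).sum(-1, keepdim=True)
        tq1, tq2 = s.critic_target(next_obs, next_action)
        # Soft value of the next state: min of the target Qs minus the
        # entropy bonus scaled by the current temperature.
        v = torch.min(tq1, tq2) - s.log_alpha.exp() * log_prob
        return reward + (1.0 - done) * s.discount * v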