Ejemplo n.º 1
0
 def sac_lr(self, train_config):
     """Build a SAC instance constructed with ``lr_scheduler=LambdaLR``.

     Not used for training, only used for testing apis. Before creating the
     returned instance, this checks that constructing SAC with
     ``lr_scheduler`` but without ``lr_scheduler_args`` raises a
     ``TypeError`` whose message matches
     ``"missing .+ positional argument"``.

     Args:
         train_config: Config object; ``observe_dim``, ``action_dim``,
             ``action_range``, ``device`` and ``replay_size`` are read.

     Returns:
         The SAC instance built with ``lr_scheduler_args`` supplied.
     """
     cfg = train_config
     dev = cfg.device

     def make_critic():
         # Every call creates a fresh Critic, moves it to the device and
         # wraps it with smw (device passed for both trailing arguments).
         return smw(Critic(cfg.observe_dim, cfg.action_dim).to(dev), dev, dev)

     actor = smw(
         Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range).to(dev),
         dev,
         dev,
     )
     # Creation order: critic, critic target, critic2, critic2 target.
     critic, critic_t, critic2, critic2_t = (make_critic() for _ in range(4))
     networks = (actor, critic, critic_t, critic2, critic2_t)

     # NOTE(review): the pairs look like (step, learning rate) milestones;
     # confirm against gen_learning_rate_func.
     lr_func = gen_learning_rate_func(
         [(0, 1e-3), (200000, 3e-4)], logger=logger
     )

     # Keyword arguments common to both constructor calls below.
     shared_kwargs = {
         "replay_device": dev,
         "replay_size": cfg.replay_size,
         "lr_scheduler": LambdaLR,
     }

     # A scheduler class without its arguments is expected to be rejected.
     with pytest.raises(TypeError, match="missing .+ positional argument"):
         SAC(
             *networks,
             t.optim.Adam,
             nn.MSELoss(reduction="sum"),
             **shared_kwargs,
         )

     # Same call, now with one argument tuple per scheduler.
     return SAC(
         *networks,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         lr_scheduler_args=((lr_func,),) * 3,
         **shared_kwargs,
     )
Ejemplo n.º 2
0
 def sac(self, train_config, device, dtype):
     """Build a SAC instance from networks cast to ``dtype`` on ``device``.

     Args:
         train_config: Config object; ``observe_dim``, ``action_dim``,
             ``action_range`` and ``replay_size`` are read.
         device: Passed to ``.to()`` on every network and given to ``smw``
             for both of its trailing arguments.
         dtype: Passed to ``.type()`` on every network.

     Returns:
         The constructed SAC instance; its replay device is ``"cpu"``.
     """
     cfg = train_config

     def wrap(module):
         # Cast first, then move, then wrap: same order as a chained
         # ``.type(dtype).to(device)`` followed by smw.
         return smw(module.type(dtype).to(device), device, device)

     actor = wrap(Actor(cfg.observe_dim, cfg.action_dim, cfg.action_range))
     # Four independent critics, in order: critic, critic target,
     # critic2, critic2 target.
     critics = [
         wrap(Critic(cfg.observe_dim, cfg.action_dim)) for _ in range(4)
     ]

     return SAC(
         actor,
         *critics,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device="cpu",
         replay_size=cfg.replay_size,
     )
Ejemplo n.º 3
0
 def sac_vis(self, train_config, device, dtype, tmpdir):
     """Build a SAC instance with visualization enabled.

     Not used for training, only used for testing apis.

     Args:
         train_config: Config object; ``observe_dim``, ``action_dim``,
             ``action_range`` and ``replay_size`` are read.
         device: Passed to ``.to()`` on every network and given to ``smw``
             for both of its trailing arguments.
         dtype: Passed to ``.type()`` on every network.
         tmpdir: Object providing ``make_numbered_dir()``; the directory it
             returns is passed (as a string) as ``visualize_dir``.

     Returns:
         The SAC instance constructed with ``visualize=True``.
     """
     cfg = train_config
     vis_dir = tmpdir.make_numbered_dir()
     obs_dim, act_dim = cfg.observe_dim, cfg.action_dim

     actor_net = Actor(obs_dim, act_dim, cfg.action_range)
     actor = smw(actor_net.type(dtype).to(device), device, device)

     # Build the four critics one after another, keeping the order
     # critic, critic target, critic2, critic2 target.
     critic_nets = []
     for _ in range(4):
         raw_critic = Critic(obs_dim, act_dim).type(dtype).to(device)
         critic_nets.append(smw(raw_critic, device, device))
     critic, critic_t, critic2, critic2_t = critic_nets

     sac = SAC(
         actor,
         critic,
         critic_t,
         critic2,
         critic2_t,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device="cpu",
         replay_size=cfg.replay_size,
         visualize=True,
         visualize_dir=str(vis_dir),
     )
     return sac
Ejemplo n.º 4
0
    def test_config_init(self, train_config):
        """Create SAC from a generated config, store one transition, update.

        The config returned by ``SAC.generate_config({})`` is filled with one
        "Actor" and four "Critic" model names plus their keyword arguments,
        then handed to ``SAC.init_from_config``. A single all-zero transition
        is stored and ``update()`` is called once.

        Args:
            train_config: Config object; ``observe_dim``, ``action_dim`` and
                ``action_range`` are read.
        """
        cfg = train_config
        config = SAC.generate_config({})
        frame_config = config["frame_config"]

        frame_config["models"] = ["Actor"] + ["Critic"] * 4

        actor_kwargs = {
            "state_dim": cfg.observe_dim,
            "action_dim": cfg.action_dim,
            "action_range": cfg.action_range,
        }
        critic_kwargs = {
            "state_dim": cfg.observe_dim,
            "action_dim": cfg.action_dim,
        }
        # List multiplication repeats the very same critic kwargs dict
        # four times.
        frame_config["model_kwargs"] = [actor_kwargs] + [critic_kwargs] * 4

        sac = SAC.init_from_config(config)

        # One zero tensor is used for both the state and the next state.
        zero_state = t.zeros([1, cfg.observe_dim], dtype=t.float32)
        zero_action = t.zeros([1, cfg.action_dim], dtype=t.float32)
        transition = {
            "state": {"state": zero_state},
            "action": {"action": zero_action},
            "next_state": {"state": zero_state},
            "reward": 0,
            "terminal": False,
        }
        sac.store_transition(transition)

        # heuristic entropy
        sac.target_entropy = -cfg.action_dim
        sac.update()
Ejemplo n.º 5
0
 def sac(self, train_config):
     """Build a SAC instance with all networks on ``train_config.device``.

     Args:
         train_config: Config object; ``observe_dim``, ``action_dim``,
             ``action_range``, ``device`` and ``replay_size`` are read.

     Returns:
         The constructed SAC instance; its replay device is the config
         device.
     """
     cfg = train_config
     dev = cfg.device
     dims = (cfg.observe_dim, cfg.action_dim)

     actor_net = Actor(*dims, cfg.action_range).to(dev)
     actor = smw(actor_net, dev, dev)

     # Four independent critics, created in the order: critic,
     # critic target, critic2, critic2 target.
     wrapped_critics = [
         smw(Critic(*dims).to(dev), dev, dev) for _ in range(4)
     ]
     critic, critic_t, critic2, critic2_t = wrapped_critics

     return SAC(
         actor,
         critic,
         critic_t,
         critic2,
         critic2_t,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device=dev,
         replay_size=cfg.replay_size,
     )