Example #1
    def __init__(self, state_dim, action_dim, device, policy_hidden_units):
        self._state_dim = state_dim
        self._action_dim = action_dim
        self._device = device
        self._policy_net = GaussianPolicy(
            state_dim=self._state_dim,
            action_dim=self._action_dim,
            hidden_units=policy_hidden_units).eval().to(self._device)
        disable_gradients(self._policy_net)
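
Each example freezes a target or pre-trained network with `disable_gradients`, whose definition is not shown above. A minimal sketch, assuming the helper only needs to switch off autograd for every parameter of the module:

    import torch.nn as nn

    def disable_gradients(network: nn.Module) -> None:
        # Freeze the module: its parameters will neither compute nor
        # accumulate gradients during backpropagation.
        for param in network.parameters():
            param.requires_grad = False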
Example #2
    def __init__(self,
                 state_dim,
                 action_dim,
                 device,
                 gamma=0.99,
                 nstep=1,
                 policy_lr=0.0003,
                 q_lr=0.0003,
                 entropy_lr=0.0003,
                 policy_hidden_units=[256, 256],
                 q_hidden_units=[256, 256],
                 target_update_coef=0.005,
                 log_interval=10,
                 seed=0):
        super().__init__(state_dim, action_dim, device, gamma, nstep,
                         log_interval, seed)

        # Build networks.
        self._policy_net = GaussianPolicy(state_dim=self._state_dim,
                                          action_dim=self._action_dim,
                                          hidden_units=policy_hidden_units).to(
                                              self._device)
        self._online_q_net = TwinnedStateActionFunction(
            state_dim=self._state_dim,
            action_dim=self._action_dim,
            hidden_units=q_hidden_units).to(self._device)
        self._target_q_net = TwinnedStateActionFunction(
            state_dim=self._state_dim,
            action_dim=self._action_dim,
            hidden_units=q_hidden_units).to(self._device).eval()

        # Copy parameters of the learning network to the target network.
        self._target_q_net.load_state_dict(self._online_q_net.state_dict())

        # Disable gradient calculations of the target network.
        disable_gradients(self._target_q_net)

        # Optimizers.
        self._policy_optim = Adam(self._policy_net.parameters(), lr=policy_lr)
        self._q_optim = Adam(self._online_q_net.parameters(), lr=q_lr)

        # Target entropy is -|A|.
        self._target_entropy = -float(self._action_dim)

        # We optimize log(alpha), instead of alpha.
        self._log_alpha = torch.zeros(1,
                                      device=self._device,
                                      requires_grad=True)
        self._alpha = self._log_alpha.detach().exp()
        self._alpha_optim = Adam([self._log_alpha], lr=entropy_lr)

        self._target_update_coef = target_update_coef
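
Example #2 stores `target_update_coef` and optimizes `log(alpha)` against a target entropy of -|A|, but the corresponding update steps live outside the constructor. A hedged sketch of how SAC-style agents typically apply them; the function names and standalone signatures are illustrative, not taken from the source:

    import torch

    def soft_update(target_net, online_net, target_update_coef):
        # Polyak averaging: target <- (1 - tau) * target + tau * online.
        for t_param, o_param in zip(target_net.parameters(),
                                    online_net.parameters()):
            t_param.data.mul_(1.0 - target_update_coef)
            t_param.data.add_(target_update_coef * o_param.data)

    def entropy_loss(log_alpha, log_pis, target_entropy):
        # Tune alpha so the policy entropy tracks the target -|A|:
        # the gradient pushes alpha up when entropy falls below the target.
        return -(log_alpha * (target_entropy + log_pis).detach()).mean()

After an optimizer step on such a loss, `_alpha` would be refreshed via `self._log_alpha.detach().exp()`, matching how it is initialized above.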
Example #3
    def __init__(self,
                 state_dim,
                 action_dim,
                 device,
                 gamma=0.99,
                 nstep=1,
                 policy_lr=0.0003,
                 q_lr=0.0003,
                 entropy_lr=0.0003,
                 error_lr=0.0003,
                 policy_hidden_units=[256, 256],
                 q_hidden_units=[256, 256],
                 error_hidden_units=[256, 256, 256],
                 tau_init=10.0,
                 target_update_coef=0.005,
                 log_interval=10,
                 seed=0):
        super().__init__(state_dim, action_dim, device, gamma, nstep,
                         policy_lr, q_lr, entropy_lr, policy_hidden_units,
                         q_hidden_units, target_update_coef, log_interval,
                         seed)

        # Build error networks.
        self._online_error_net = TwinnedStateActionFunction(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_units=error_hidden_units).to(device=self._device)
        self._target_error_net = TwinnedStateActionFunction(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_units=error_hidden_units).to(device=self._device).eval()

        # Copy parameters of the learning network to the target network.
        self._target_error_net.load_state_dict(
            self._online_error_net.state_dict())

        # Disable gradient calculations of the target network.
        disable_gradients(self._target_error_net)

        self._error_optim = Adam(self._online_error_net.parameters(),
                                 lr=error_lr)

        self._tau1 = torch.tensor(tau_init,
                                  device=self._device,
                                  requires_grad=False)
        self._tau2 = torch.tensor(tau_init,
                                  device=self._device,
                                  requires_grad=False)
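
Example #3 adds twin error networks with temperatures `tau1`/`tau2`, the setup used for DisCor-style error correction. Converting the predicted errors into per-sample weights happens elsewhere; a hedged sketch of the usual DisCor weighting, where the function name, the assumption that the error net returns two tensors, and the batch-softmax normalization are all illustrative:

    import torch

    def discor_weights(target_error_net, next_states, next_actions, dones,
                       gamma, tau1, tau2):
        # DisCor downweights transitions whose bootstrap targets are
        # expected to be inaccurate: w ~ exp(-gamma * error(s', a') / tau).
        with torch.no_grad():
            next_err1, next_err2 = target_error_net(next_states, next_actions)
            x1 = -(1.0 - dones) * gamma * next_err1 / tau1
            x2 = -(1.0 - dones) * gamma * next_err2 / tau2
        batch_size = next_states.shape[0]
        # Softmax over the batch keeps the weights normalized (mean ~ 1).
        weights1 = torch.softmax(x1, dim=0) * batch_size
        weights2 = torch.softmax(x2, dim=0) * batch_size
        return weights1, weights2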
Example #4
    def __init__(self,
                 state_dim,
                 action_dim,
                 device,
                 gamma=0.99,
                 nstep=1,
                 policy_lr=0.0003,
                 q_lr=0.0003,
                 entropy_lr=0.0003,
                 policy_hidden_units=[256, 256],
                 q_hidden_units=[256, 256],
                 target_update_coef=0.005,
                 log_interval=10,
                 seed=0):
        super().__init__(state_dim, action_dim, device, gamma, nstep,
                         log_interval, seed)

        # Build networks.
        self._online_q_net = TwinnedDQNNet(state_dim=self._state_dim,
                                           action_dim=self._action_dim,
                                           hidden_units=q_hidden_units).to(
                                               self._device)
        self._target_q_net = TwinnedDQNNet(state_dim=self._state_dim,
                                           action_dim=self._action_dim,
                                           hidden_units=q_hidden_units).to(
                                               self._device).eval()

        # Copy parameters of the learning network to the target network.
        self._target_q_net.load_state_dict(self._online_q_net.state_dict())

        # Disable gradient calculations of the target network.
        disable_gradients(self._target_q_net)

        # Optimizers.
        self._q_optim = Adam(self._online_q_net.parameters(), lr=q_lr)

        self._target_update_coef = target_update_coef
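
Example #4 is a discrete-action variant: twinned DQN heads share a single Q-optimizer, so the two heads are presumably combined when forming TD targets. A hedged sketch of a clipped double-Q target, assuming `TwinnedDQNNet` returns one Q-value tensor of shape `(batch, action_dim)` per head:

    import torch

    def clipped_double_q_target(target_q_net, rewards, next_states, dones,
                                gamma, nstep=1):
        with torch.no_grad():
            next_q1, next_q2 = target_q_net(next_states)
            # Clipped double-Q: take the element-wise minimum of the two
            # heads, then the value of the greedy action.
            next_q = torch.min(next_q1, next_q2).max(dim=1, keepdim=True)[0]
        return rewards + (1.0 - dones) * (gamma ** nstep) * next_q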
Example #5
    def __init__(self,
                 state_dim,
                 action_dim,
                 device,
                 gamma=0.99,
                 nstep=1,
                 policy_lr=0.0003,
                 q_lr=0.0003,
                 entropy_lr=0.0003,
                 error_lr=0.0003,
                 policy_hidden_units=[256, 256],
                 q_hidden_units=[256, 256],
                 error_hidden_units=[256, 256, 256],
                 prob_hidden_units=[128, 128],
                 prob_temperature=7.5,
                 tau_init=10.0,
                 target_update_coef=0.005,
                 lfiw=False,
                 tau_scale=1,
                 hard_tper_weight=0.4,
                 log_interval=10,
                 seed=0,
                 discor=False,
                 tper=False,
                 log_dir=None,
                 env=None,
                 eval_tper=False,
                 use_backward_timestep=False,
                 reweigh_type="hard",
                 reweigh_hyper=None):
        super().__init__(state_dim, action_dim, device, gamma, nstep,
                         policy_lr, q_lr, entropy_lr, policy_hidden_units,
                         q_hidden_units, target_update_coef, log_interval,
                         seed, env, eval_tper, log_dir)

        self.discor = discor
        self.lfiw = lfiw
        self.tper = tper
        # Build error networks.
        if self.discor:
            self._online_error_net = TwinnedStateActionFunction(
                state_dim=state_dim,
                action_dim=action_dim,
                hidden_units=error_hidden_units).to(device=self._device)
            self._target_error_net = TwinnedStateActionFunction(
                state_dim=state_dim,
                action_dim=action_dim,
                hidden_units=error_hidden_units).to(
                    device=self._device).eval()
            # Copy parameters of the learning network to the target network.
            self._target_error_net.load_state_dict(
                self._online_error_net.state_dict())
            # Disable gradient calculations of the target network.
            disable_gradients(self._target_error_net)

            self._error_optim = Adam(self._online_error_net.parameters(),
                                     lr=error_lr)
            self._tau1 = torch.tensor(tau_init,
                                      device=self._device,
                                      requires_grad=False)
            self._tau2 = torch.tensor(tau_init,
                                      device=self._device,
                                      requires_grad=False)

            if tau_init < 1e-6:
                self.no_tau = True
                print("===========No tau!==========")
            else:
                self.no_tau = False
            self.tau_scale = tau_scale

        if self.lfiw:
            self._prob_classifier = FlattenMlp(
                input_size=state_dim + action_dim,
                output_size=1,
                hidden_sizes=prob_hidden_units,
            ).to(device=self._device)
            self._prob_optim = Adam(self._prob_classifier.parameters(),
                                    lr=q_lr)
            self.prob_temperature = prob_temperature

        if self.tper:
            self.hard_tper_weight = hard_tper_weight
            self.use_backward_timestep = use_backward_timestep
            self.reweigh_type = reweigh_type
            self.reweigh_hyper = reweigh_hyper
            self.l, self.h, self.k, self.b = \
                [torch.tensor(i).to(device=self._device) for i in self.reweigh_hyper["linear"]]
            if self.reweigh_type in ["adaptive_linear", "done_cnt_linear"]:
                self.low_l, self.low_h, self.high_l, self.high_h, self.t_s, self.t_e = \
                    [torch.tensor(i).to(device=self._device) for i in self.reweigh_hyper["adaptive_linear"]]
            if "exp" in self.reweigh_type:
                self.exp_k, self.exp_gamma = self.reweigh_hyper["exp"]
        self.Qs = 2

        self._param_dir = os.path.join(log_dir, 'param')
        os.makedirs(self._param_dir, exist_ok=True)
        # Dump the constructor arguments (via locals()) so the run's
        # hyperparameters can be inspected later.
        with open(os.path.join(self._param_dir, "discor_params.txt"),
                  'w') as f:
            for key, value in locals().items():
                print(key, ":", value, file=f)
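
Example #5 only constructs the LFIW classifier (a `FlattenMlp` over concatenated state-action inputs) and its optimizer; turning its scores into sampling weights happens elsewhere. A hedged sketch of one common likelihood-free importance-weighting scheme, where the sigmoid transform and the mean-normalization are assumptions rather than the source's exact rule:

    import torch

    def lfiw_weights(prob_classifier, states, actions, temperature):
        # The classifier scores (s, a) pairs by how "on-policy-like" they
        # are; a temperature-scaled sigmoid turns the logits into weights.
        with torch.no_grad():
            logits = prob_classifier(torch.cat([states, actions], dim=-1))
        weights = torch.sigmoid(logits / temperature)
        # Normalize so the weights average to one over the batch.
        return weights / weights.mean()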