def __init__(self, state_dim, action_dim, device, policy_hidden_units):
    self._state_dim = state_dim
    self._action_dim = action_dim
    self._device = device

    # Build a frozen, inference-only policy network.
    self._policy_net = GaussianPolicy(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=policy_hidden_units).eval().to(self._device)
    disable_gradients(self._policy_net)
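# `disable_gradients` is used throughout these constructors but not
# defined in this listing. A minimal sketch of what such a helper
# presumably does (an assumption, not necessarily the repository's
# exact implementation):
def disable_gradients(network):
    # Freeze every parameter so autograd never tracks this network.
    for param in network.parameters():
        param.requires_grad = False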
def __init__(self, state_dim, action_dim, device, gamma=0.99, nstep=1,
             policy_lr=0.0003, q_lr=0.0003, entropy_lr=0.0003,
             policy_hidden_units=[256, 256], q_hidden_units=[256, 256],
             target_update_coef=0.005, log_interval=10, seed=0):
    super().__init__(state_dim, action_dim, device, gamma, nstep,
                     log_interval, seed)

    # Build networks.
    self._policy_net = GaussianPolicy(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=policy_hidden_units).to(self._device)
    self._online_q_net = TwinnedStateActionFunction(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=q_hidden_units).to(self._device)
    self._target_q_net = TwinnedStateActionFunction(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=q_hidden_units).to(self._device).eval()

    # Copy parameters of the learning network to the target network.
    self._target_q_net.load_state_dict(self._online_q_net.state_dict())

    # Disable gradient calculations of the target network.
    disable_gradients(self._target_q_net)

    # Optimizers.
    self._policy_optim = Adam(self._policy_net.parameters(), lr=policy_lr)
    self._q_optim = Adam(self._online_q_net.parameters(), lr=q_lr)

    # Target entropy is -|A|.
    self._target_entropy = -float(self._action_dim)

    # We optimize log(alpha) instead of alpha.
    self._log_alpha = torch.zeros(
        1, device=self._device, requires_grad=True)
    self._alpha = self._log_alpha.detach().exp()
    self._alpha_optim = Adam([self._log_alpha], lr=entropy_lr)

    self._target_update_coef = target_update_coef
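# The target Q network built above is typically refreshed by Polyak
# averaging with `target_update_coef`. This is a minimal sketch of the
# standard SAC-style soft update (hypothetical helper name; the actual
# update step lives elsewhere in the class):
import torch


def soft_update(target_net, online_net, coef):
    # target <- (1 - coef) * target + coef * online, without autograd.
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(),
                                    online_net.parameters()):
            t_param.mul_(1.0 - coef).add_(o_param, alpha=coef)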
def __init__(self, state_dim, action_dim, device, gamma=0.99, nstep=1,
             policy_lr=0.0003, q_lr=0.0003, entropy_lr=0.0003,
             error_lr=0.0003, policy_hidden_units=[256, 256],
             q_hidden_units=[256, 256],
             error_hidden_units=[256, 256, 256], tau_init=10.0,
             target_update_coef=0.005, log_interval=10, seed=0):
    super().__init__(state_dim, action_dim, device, gamma, nstep,
                     policy_lr, q_lr, entropy_lr, policy_hidden_units,
                     q_hidden_units, target_update_coef, log_interval,
                     seed)

    # Build error networks.
    self._online_error_net = TwinnedStateActionFunction(
        state_dim=state_dim,
        action_dim=action_dim,
        hidden_units=error_hidden_units).to(device=self._device)
    self._target_error_net = TwinnedStateActionFunction(
        state_dim=state_dim,
        action_dim=action_dim,
        hidden_units=error_hidden_units).to(device=self._device).eval()

    # Copy parameters of the learning network to the target network.
    self._target_error_net.load_state_dict(
        self._online_error_net.state_dict())

    # Disable gradient calculations of the target network.
    disable_gradients(self._target_error_net)

    self._error_optim = Adam(
        self._online_error_net.parameters(), lr=error_lr)
    self._tau1 = torch.tensor(
        tau_init, device=self._device, requires_grad=False)
    self._tau2 = torch.tensor(
        tau_init, device=self._device, requires_grad=False)
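# In DisCor, the error networks above estimate the accumulated Bellman
# error, and `_tau1`/`_tau2` turn those estimates into importance
# weights for the Q loss. A minimal sketch of that weighting, assuming
# the standard DisCor rule w ∝ exp(-gamma * error(s', a') / tau)
# (hypothetical function names; the real update lives elsewhere):
import torch


def calc_importance_weights(next_errors, tau, gamma):
    # Softmax over the batch, rescaled so the weights average to one.
    batch_size = next_errors.shape[0]
    return torch.softmax(-gamma * next_errors / tau, dim=0) * batch_size


def update_tau(tau, mean_error, coef):
    # tau tracks a running mean of the predicted errors.
    return (1.0 - coef) * tau + coef * mean_error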
def __init__(self, state_dim, action_dim, device, gamma=0.99, nstep=1,
             policy_lr=0.0003, q_lr=0.0003, entropy_lr=0.0003,
             policy_hidden_units=[256, 256], q_hidden_units=[256, 256],
             target_update_coef=0.005, log_interval=10, seed=0):
    super().__init__(state_dim, action_dim, device, gamma, nstep,
                     log_interval, seed)

    # Build networks.
    self._online_q_net = TwinnedDQNNet(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=q_hidden_units).to(self._device)
    self._target_q_net = TwinnedDQNNet(
        state_dim=self._state_dim,
        action_dim=self._action_dim,
        hidden_units=q_hidden_units).to(self._device).eval()

    # Copy parameters of the learning network to the target network.
    self._target_q_net.load_state_dict(self._online_q_net.state_dict())

    # Disable gradient calculations of the target network.
    disable_gradients(self._target_q_net)

    # Optimizers.
    self._q_optim = Adam(self._online_q_net.parameters(), lr=q_lr)

    self._target_update_coef = target_update_coef
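# With twinned Q networks, the TD target usually takes the element-wise
# minimum of the two target heads (clipped double Q-learning). A minimal
# sketch, assuming `TwinnedDQNNet` returns a pair of (batch, action_dim)
# tensors and that the base class stores `gamma`/`nstep` as
# `self._gamma`/`self._nstep` (hypothetical method name):
import torch


def calc_target_q(self, rewards, next_states, dones):
    with torch.no_grad():
        next_q1, next_q2 = self._target_q_net(next_states)
        # Greedy value under the pessimistic (min) estimate.
        next_q = torch.min(next_q1, next_q2).max(dim=1, keepdim=True)[0]
    return rewards + (1.0 - dones) * (self._gamma ** self._nstep) * next_q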
def __init__(self, state_dim, action_dim, device, gamma=0.99, nstep=1,
             policy_lr=0.0003, q_lr=0.0003, entropy_lr=0.0003,
             error_lr=0.0003, policy_hidden_units=[256, 256],
             q_hidden_units=[256, 256],
             error_hidden_units=[256, 256, 256],
             prob_hidden_units=[128, 128], prob_temperature=7.5,
             tau_init=10.0, target_update_coef=0.005, lfiw=False,
             tau_scale=1, hard_tper_weight=0.4, log_interval=10, seed=0,
             discor=False, tper=False, log_dir=None, env=None,
             eval_tper=False, use_backward_timestep=False,
             reweigh_type="hard", reweigh_hyper=None):
    super().__init__(state_dim, action_dim, device, gamma, nstep,
                     policy_lr, q_lr, entropy_lr, policy_hidden_units,
                     q_hidden_units, target_update_coef, log_interval,
                     seed, env, eval_tper, log_dir)
    self.discor = discor
    self.lfiw = lfiw
    self.tper = tper

    # Build error networks (DisCor only).
    if self.discor:
        self._online_error_net = TwinnedStateActionFunction(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_units=error_hidden_units).to(device=self._device)
        self._target_error_net = TwinnedStateActionFunction(
            state_dim=state_dim,
            action_dim=action_dim,
            hidden_units=error_hidden_units).to(
                device=self._device).eval()

        # Copy parameters of the learning network to the target network.
        self._target_error_net.load_state_dict(
            self._online_error_net.state_dict())

        # Disable gradient calculations of the target network.
        disable_gradients(self._target_error_net)

        self._error_optim = Adam(
            self._online_error_net.parameters(), lr=error_lr)
        self._tau1 = torch.tensor(
            tau_init, device=self._device, requires_grad=False)
        self._tau2 = torch.tensor(
            tau_init, device=self._device, requires_grad=False)

    # A (near-)zero tau_init disables the tau-based error weighting.
    if tau_init < 1e-6:
        self.no_tau = True
        print("===========No tau!==========")
    else:
        self.no_tau = False
    self.tau_scale = tau_scale

    # Classifier for likelihood-free importance weighting (LFIW).
    if self.lfiw:
        self._prob_classifier = FlattenMlp(
            input_size=state_dim + action_dim,
            output_size=1,
            hidden_sizes=prob_hidden_units,
        ).to(device=self._device)
        self._prob_optim = Adam(
            self._prob_classifier.parameters(), lr=q_lr)
        self.prob_temperature = prob_temperature

    # Hyperparameters for temporal-order-based reweighting (TPER).
    if self.tper:
        self.hard_tper_weight = hard_tper_weight
        self.use_backward_timestep = use_backward_timestep
        self.reweigh_type = reweigh_type
        self.reweigh_hyper = reweigh_hyper
        self.l, self.h, self.k, self.b = [
            torch.tensor(i).to(device=self._device)
            for i in self.reweigh_hyper["linear"]]
        if self.reweigh_type in ["adaptive_linear", "done_cnt_linear"]:
            (self.low_l, self.low_h, self.high_l, self.high_h,
             self.t_s, self.t_e) = [
                torch.tensor(i).to(device=self._device)
                for i in self.reweigh_hyper["adaptive_linear"]]
        if "exp" in self.reweigh_type:
            self.exp_k, self.exp_gamma = self.reweigh_hyper["exp"]

    self.Qs = 2

    # Dump all constructor arguments for reproducibility.
    self._param_dir = os.path.join(log_dir, 'param')
    if not os.path.exists(self._param_dir):
        os.makedirs(self._param_dir)
    with open(os.path.join(self._param_dir, "discor_params.txt"),
              'w') as f:
        for key, value in locals().items():
            print(key, ":", value, file=f)
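# When `lfiw` is enabled, `_prob_classifier` scores (state, action)
# pairs and those scores become per-sample loss weights, softened by
# `prob_temperature`. A minimal sketch of that reweighting under those
# assumptions (hypothetical method name; the exact normalization may
# differ in the actual update code):
import torch


def calc_lfiw_weights(self, states, actions):
    # Higher classifier scores -> larger weights, tempered by T.
    logits = self._prob_classifier(torch.cat([states, actions], dim=1))
    weights = torch.exp(logits / self.prob_temperature)
    return weights / weights.mean()  # keep the average weight at one.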