def __init__(self, num_inputs, action_space, args):
    self.gamma = args.gamma
    self.tau = args.tau
    self.alpha = args.alpha

    self.policy_type = args.policy
    self.target_update_interval = args.target_update_interval
    self.automatic_entropy_tuning = args.automatic_entropy_tuning

    self.device = torch.device("cuda" if args.cuda else "cpu")

    # Twin Q-networks and their target copy, synchronized once at construction.
    self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

    self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        # Target entropy = -dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper.
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
    else:
        # Deterministic policy: no entropy term, so disable temperature tuning.
        self.alpha = 0
        self.automatic_entropy_tuning = False
        self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
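# The constructor above calls hard_update(), which is defined elsewhere in the repo
# (typically a utils module). Below is a minimal sketch of the usual hard_update /
# soft_update pair assumed here, not necessarily the repo's exact code: hard_update
# copies the critic weights into the target once at construction, and soft_update is
# the Polyak-averaging step later applied with self.tau.

def hard_update(target, source):
    # Overwrite every target parameter with the corresponding source parameter.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)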
def __init__(self,
             env,
             train_tasks,
             eval_tasks,
             latent_dim,
             nets,
             policy_lr=3e-4,
             qf_lr=1e-3,
             vf_lr=1e-3,
             alpha=1,
             automatic_entropy_tuning=True,
             lr=3e-4,
             context_lr=3e-4,
             kl_lambda=1.,
             policy_mean_reg_weight=1e-3,
             policy_std_reg_weight=1e-3,
             policy_pre_activation_weight=0.,
             optimizer_class=optim.Adam,
             recurrent=False,
             use_information_bottleneck=True,
             sparse_rewards=False,
             soft_target_tau=0.005,  # was 1e-2 in the base implementation
             plotter=None,
             render_eval_paths=False,
             **kwargs):
    super().__init__(env=env,
                     agent=nets[0],
                     train_tasks=train_tasks,
                     eval_tasks=eval_tasks,
                     **kwargs)

    self.soft_target_tau = soft_target_tau
    self.policy_mean_reg_weight = policy_mean_reg_weight
    self.policy_std_reg_weight = policy_std_reg_weight
    self.policy_pre_activation_weight = policy_pre_activation_weight
    self.plotter = plotter
    self.render_eval_paths = render_eval_paths
    self.alpha = alpha
    self.automatic_entropy_tuning = automatic_entropy_tuning
    self.recurrent = recurrent
    self.latent_dim = latent_dim
    self.vib_criterion = nn.MSELoss()
    self.l2_reg_criterion = nn.MSELoss()
    self.kl_lambda = kl_lambda
    self.use_information_bottleneck = use_information_bottleneck
    self.sparse_rewards = sparse_rewards

    # nets = [agent, critic, critic_target]; the critic bundles both Q-networks (q1, q2)
    # and the target holds their copies, replacing the separate qf1/qf2/vf and target_vf
    # networks (and their per-network optimizers) of the original value-function setup.
    self.critic, self.critic_target = nets[1:]
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
    hard_update(self.critic_target, self.critic)

    self.policy_optimizer = Adam(self.agent.policy.parameters(), lr=policy_lr)

    # Automatic entropy tuning: target entropy = -dim(A).
    # torch.prod(input) returns the product of all elements.
    self.target_entropy = -torch.prod(
        torch.Tensor(env.action_space.shape)).to("cuda").item()
    self.log_alpha = torch.zeros(1, requires_grad=True, device="cuda")
    self.alpha_optim = Adam([self.log_alpha], lr=lr)

    self.context_optimizer = optimizer_class(
        self.agent.context_encoder.parameters(),
        lr=context_lr,
    )
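# Hedged sketch (not this repo's exact training code) of how the temperature machinery
# set up above is typically used during a gradient step. The method name _update_alpha
# and the log_pi argument are assumptions; only self.log_alpha, self.alpha_optim,
# self.target_entropy, and self.alpha come from the constructor.

def _update_alpha(self, log_pi):
    # Standard SAC temperature loss: J(alpha) = E[-log_alpha * (log_pi + target_entropy)].
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()

    # Use the exponentiated log-temperature in the subsequent actor/critic losses.
    self.alpha = self.log_alpha.exp()
    return alpha_loss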
def __init__(self,
             action_space,
             policy: str = "Gaussian",
             gamma: float = 0.99,
             tau: float = 0.005,
             lr: float = 0.0003,
             alpha: float = 0.2,
             automatic_temperature_tuning: bool = False,
             batch_size: int = 256,
             hidden_size: int = 256,
             target_update_interval: int = 1,
             input_dim: int = 32):
    self.gamma = gamma
    self.tau = tau
    self.alpha = alpha
    self.lr = lr
    self.policy_type = policy
    self.target_update_interval = target_update_interval
    self.automatic_temperature_tuning = automatic_temperature_tuning
    self.input_dim = input_dim
    self.hidden_size = hidden_size
    self.bs = batch_size

    self.critic = QNetwork(input_dim, action_space.shape[0], hidden_size).to(DEVICE)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    self.critic_target = QNetwork(input_dim, action_space.shape[0], hidden_size).to(DEVICE)
    hard_update(self.critic_target, self.critic)

    if self.policy_type == "Gaussian":
        if self.automatic_temperature_tuning:
            # Target entropy = -dim(A), as in the SAC paper.
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(DEVICE)).item()
            self.log_alpha = torch.zeros(1, requires_grad=True, device=DEVICE)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(input_dim, action_space.shape[0], hidden_size).to(DEVICE)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)
    else:
        self.alpha = 0
        self.automatic_temperature_tuning = False
        self.policy = DeterministicPolicy(input_dim, action_space.shape[0], hidden_size).to(DEVICE)
        self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

    settings = (
        f"INITIALIZING SAC ALGORITHM WITH {self.policy_type} POLICY"
        f"\nRunning on: {DEVICE}"
        f"\nSettings: Automatic Temperature tuning = {self.automatic_temperature_tuning}, "
        f"Update Interval = {self.target_update_interval}"
        f"\nParameters: Learning rate = {self.lr}, Batch Size = {self.bs}, "
        f"Gamma = {self.gamma}, Tau = {self.tau}, Alpha = {self.alpha}"
        f"\nArchitecture: Input dimension = {self.input_dim}, Hidden layer dimension = {self.hidden_size}"
        "\n--------------------------")
    print(settings)
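# Hedged usage sketch: the class name SACAgent, the gym environment id, and using the
# raw observation dimension as input_dim are illustrative assumptions; only the
# constructor signature above is taken from the source.
import gym

env = gym.make("Pendulum-v1")
agent = SACAgent(
    action_space=env.action_space,
    policy="Gaussian",
    automatic_temperature_tuning=True,
    input_dim=env.observation_space.shape[0],
    hidden_size=256,
)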