import os

import torch
import torch.nn.functional as F
from torch.optim import Adam

# Local project modules (assumed layout: networks in model.py, target-network
# helpers in utils.py).
from model import GaussianPolicy, QNetwork, DeterministicPolicy
from utils import hard_update, soft_update


class SAC(object):

    def __init__(self, num_inputs, action_space, args):
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper.
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0],
                                              args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def select_action_batch(self, states, eval=False):
        states = torch.FloatTensor(states).to(self.device)
        if not eval:
            action, _, _ = self.policy.sample(states)
        else:
            _, _, action = self.policy.sample(states)
        return action.detach().cpu().numpy()

    def update_parameters(self, memory, batch_size, updates):
        # `memory` is an already-sampled batch tuple rather than a replay buffer:
        # state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step.
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Both Q losses share the same critic forward pass, so they are summed and
        # backpropagated in a single step (separate backward calls would try to
        # traverse the already-freed graph a second time).
        self.critic_optim.zero_grad()
        (qf1_loss + qf2_loss).backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    def weighted_update_parameters(self, memory, batch_size, updates,
                                   temperature_act=5.0, temperature=5.0):
        # `memory` is an already-sampled batch tuple that additionally carries a
        # per-transition std estimate used to down-weight uncertain samples:
        # state_batch, ..., mask_batch, std_batch = memory.sample(batch_size=batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch, std_batch = memory

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
        std_batch = torch.FloatTensor(std_batch).to(self.device).unsqueeze(1)

        # Per-sample weights in (0.5, 1.5); for non-negative std they lie in (0.5, 1.0],
        # so high-uncertainty transitions are down-weighted toward 0.5.
        weight_actor_Q = torch.sigmoid(-std_batch * temperature_act) + 0.5
        weight_target_Q = torch.sigmoid(-std_batch * temperature) + 0.5

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step.
        qf1, qf2 = self.critic(state_batch, action_batch)
        # Weighted JQ: the per-sample squared TD errors are scaled by weight_target_Q
        # before averaging, so the loss stays a scalar.
        qf1_loss = (F.mse_loss(qf1, next_q_value, reduction='none') * weight_target_Q.detach()).mean()
        qf2_loss = (F.mse_loss(qf2, next_q_value, reduction='none') * weight_target_Q.detach()).mean()

        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Weighted Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = (((self.alpha * log_pi) - min_qf_pi) * weight_actor_Q.detach()).mean()

        # As above, the shared critic forward pass requires a single backward step.
        self.critic_optim.zero_grad()
        (qf1_loss + qf2_loss).backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
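# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal, hypothetical
# example of driving the class above. It shows that `update_parameters` and
# `weighted_update_parameters` expect an already-sampled batch tuple rather
# than a replay buffer. The argument namespace fields mirror __init__; the
# random arrays stand in for data sampled from a replay buffer, and
# gym.spaces.Box is only used to provide an action space with shape/high/low.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from argparse import Namespace

    import numpy as np
    from gym.spaces import Box

    obs_dim, act_dim, batch_size = 8, 2, 32
    action_space = Box(low=-1.0, high=1.0, shape=(act_dim,), dtype=np.float32)
    args = Namespace(gamma=0.99, tau=0.005, alpha=0.2, policy="Gaussian",
                     target_update_interval=1, automatic_entropy_tuning=True,
                     cuda=torch.cuda.is_available(), hidden_size=256, lr=3e-4)
    agent = SAC(obs_dim, action_space, args)

    # Act on a single state.
    action = agent.select_action(np.random.randn(obs_dim).astype(np.float32))

    # Random stand-in for a sampled replay batch:
    # (states, actions, rewards, next_states, masks).
    states = np.random.randn(batch_size, obs_dim).astype(np.float32)
    actions = np.random.uniform(-1, 1, (batch_size, act_dim)).astype(np.float32)
    rewards = np.random.randn(batch_size).astype(np.float32)
    next_states = np.random.randn(batch_size, obs_dim).astype(np.float32)
    masks = np.ones(batch_size, dtype=np.float32)  # 0 where the episode terminated
    losses = agent.update_parameters(
        (states, actions, rewards, next_states, masks), batch_size, updates=0)

    # The weighted variant additionally takes a per-transition std estimate
    # used to compute the sigmoid weights (random values here).
    stds = np.abs(np.random.randn(batch_size)).astype(np.float32)
    agent.weighted_update_parameters(
        (states, actions, rewards, next_states, masks, stds), batch_size, updates=1)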
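# ---------------------------------------------------------------------------
# Both SAC variants in this document rely on `hard_update` and `soft_update`
# from a local utils module that is not shown here. A typical implementation
# (an assumption, sketched for reference only) copies the source parameters
# into the target network, either outright or as a Polyak average with tau:
# ---------------------------------------------------------------------------
# def hard_update(target, source):
#     # target <- source (used once, right after the target critic is created)
#     for target_param, param in zip(target.parameters(), source.parameters()):
#         target_param.data.copy_(param.data)
#
# def soft_update(target, source, tau):
#     # target <- (1 - tau) * target + tau * source
#     # (applied every target_update_interval updates)
#     for target_param, param in zip(target.parameters(), source.parameters()):
#         target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)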
class SAC(object): """ SAC (Soft actor critic) algorithm. Objects contain the Q networks and optimizers, as well as functions to safe and load models, selecting actions and perform updates for the objective functions. ## Parameters: - **num_inputs** *(int)*: dimension of input (In this case number of variables of the latent representation) - **action_space**: action space of environment (E.g. for car racer: Box(3,) which means that the action space has 3 actions that are continuous.) - **args**: namespace with needed arguments (such as discount factor, used policy and temperature parameter) """ def __init__(self, action_space, policy: str = "Gaussian", gamma: float = 0.99, tau: float = 0.005, lr: float = 0.0003, alpha: float = 0.2, automatic_temperature_tuning: bool = False, batch_size: int = 256, hidden_size: int = 256, target_update_interval: int = 1, input_dim: int = 32): self.gamma = gamma self.tau = tau self.alpha = alpha self.lr = lr self.policy_type = policy self.target_update_interval = target_update_interval self.automatic_temperature_tuning = automatic_temperature_tuning self.input_dim = input_dim self.hidden_size = hidden_size self.bs = batch_size self.critic = QNetwork(input_dim, action_space.shape[0], hidden_size).to(DEVICE) self.critic_optim = Adam(self.critic.parameters(), lr=self.lr) self.critic_target = QNetwork(input_dim, action_space.shape[0], hidden_size).to(DEVICE) hard_update(self.critic_target, self.critic) if self.policy_type == "Gaussian": if self.automatic_temperature_tuning == True: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(DEVICE)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=DEVICE) self.alpha_optim = Adam([self.log_alpha], lr=self.lr) self.policy = GaussianPolicy(input_dim, action_space.shape[0], hidden_size).to(DEVICE) self.policy_optim = Adam(self.policy.parameters(), lr=self.lr) else: self.alpha = 0 self.automatic_temperature_tuning = False self.policy = DeterministicPolicy(input_dim, action_space.shape[0], hidden_size).to(DEVICE) self.policy_optim = Adam(self.policy.parameters(), lr=self.lr) settings = ( f"INITIALIZING SAC ALGORITHM WITH {self.policy_type} POLICY" f"\nRunning on: {DEVICE}" f"\nSettings: Automatic Temperature tuning = {self.automatic_temperature_tuning}, Update Interval = {self.target_update_interval}" f"\nParameters: Learning rate = {self.lr}, Batch Size = {self.bs} Gamma = {self.gamma}, Tau = {self.tau}, Alpha = {self.alpha}" f"\nArchitecture: Input dimension = {self.input_dim}, Hidden layer dimension = {self.hidden_size}" "\n--------------------------") print(settings) def select_action(self, state, eval=False): """ Returns an action based on a given state from policy. ## Input: - **state** *(type)*: State of the environment. In our case latent representation with 32 variables. - **eval** *(boolean)*: Indicates whether to evaluate or not. When evaluating, the mean of the action distribution is returned. ##Output: - **action[0]**: [1,3] Selected action based on policy. Array with [s: steering,a: acceleration, d:deceleartion] coefficients """ state = torch.FloatTensor(state).to(DEVICE).unsqueeze(0) if eval == False: action, _, _ = self.policy.sample(state) else: _, _, action = self.policy.sample(state) action = action.detach().cpu().numpy() return action[0] def update_parameters(self, memory, batch_size, updates): """ Computes loss and updates parameters of objective functions (Q functions, policy and alpha). 
## Input: - **memory**: instance of class ReplayMemory - **batch_size** *(int)*: batch size that shall be sampled from memory - **updates**: indicates the number of the update steps already done ## Output: - **qf1_loss.item()**: loss of first q function - **qf2_loss.item()**: loss of second q function - **policy_loss.item()**: loss of policy - **alpha_loss.item()**: loss of alpha - **alpha_tlogs.item()**: alpha tlogs (For TensorboardX logs) """ # Sample a batch from memory state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample( batch_size=batch_size) state_batch = torch.FloatTensor(state_batch).to(DEVICE) next_state_batch = torch.FloatTensor(next_state_batch).to(DEVICE) action_batch = torch.FloatTensor(action_batch).to(DEVICE) reward_batch = torch.FloatTensor(reward_batch).to(DEVICE).unsqueeze(1) mask_batch = torch.FloatTensor(mask_batch).to(DEVICE).unsqueeze(1) with torch.no_grad(): next_state_action, next_state_log_pi, _ = self.policy.sample( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * ( min_qf_next_target) qf1, qf2 = self.critic( state_batch, action_batch ) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss( qf1, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss( qf2, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] pi, log_pi, _ = self.policy.sample(state_batch) qf1_pi, qf2_pi = self.critic(state_batch, pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean( ) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] self.critic_optim.zero_grad() qf1_loss.backward() self.critic_optim.step() self.critic_optim.zero_grad() qf2_loss.backward() self.critic_optim.step() self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() if self.automatic_temperature_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.).to(DEVICE) alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs if updates % self.target_update_interval == 0: soft_update(self.critic_target, self.critic, self.tau) return qf1_loss.item(), qf2_loss.item(), policy_loss.item( ), alpha_loss.item(), alpha_tlogs.item() # Save model parameters def save_model(self, env_name: str, identifier: str, suffix: str = ".pt", actor_path=None, critic_path=None): path = Path("models/") if not path.exists(): path.mkdir() if actor_path is None: actor_path = ( path / f"sac_actor_{env_name}_{identifier}").with_suffix(suffix) if critic_path is None: critic_path = ( path / f"sac_critic_{env_name}_{identifier}").with_suffix(suffix) print(f"Saving models to {actor_path} and {critic_path}") torch.save(self.policy.state_dict(), actor_path) torch.save(self.critic.state_dict(), critic_path) # Load model parameters """ Insert your description here. 
## Parameters: - **param1** *(type)*: ## Input: - **Input 1** *(shapes)*: ## Output: - **Output 1** *(shapes)*: """ def load_model(self, actor_path, critic_path): print(f"Loading models from {actor_path} and {critic_path}") if actor_path is not None: self.policy.load_state_dict( torch.load(actor_path, map_location=DEVICE)) if critic_path is not None: self.critic.load_state_dict( torch.load(critic_path, map_location=DEVICE))
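# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a hypothetical example of
# using this keyword-argument variant on a 32-dimensional latent state. The
# Box(3,) action space mirrors the car-racer example from the class docstring;
# `ReplayMemory` (and its push/sample interface) is an assumed local class, so
# the update and checkpoint calls are left commented out.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np
    from gym.spaces import Box

    action_space = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
    agent = SAC(action_space,
                policy="Gaussian",
                automatic_temperature_tuning=True,
                input_dim=32)

    latent_state = np.random.randn(32).astype(np.float32)
    action = agent.select_action(latent_state)                  # stochastic action
    eval_action = agent.select_action(latent_state, eval=True)  # mean action

    # from replay_memory import ReplayMemory   # assumed local buffer class
    # memory = ReplayMemory(capacity=100_000)
    # ... push (state, action, reward, next_state, mask) transitions ...
    # losses = agent.update_parameters(memory, batch_size=256, updates=0)

    # agent.save_model("carracer", identifier="demo")
    # agent.load_model("models/sac_actor_carracer_demo.pt",
    #                  "models/sac_critic_carracer_demo.pt")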