class DQNAgent(object):
    def __init__(self, env, args, work_dir):
        self.env = env
        self.args = args
        self.work_dir = work_dir
        self.n_action = self.env.action_space.n
        self.arr_actions = np.arange(self.n_action)
        self.memory = ReplayMemory(self.args.buffer_size, self.args.device)
        self.qNetwork = ValueNetwork(self.n_action, self.env).to(self.args.device)
        self.targetNetwork = ValueNetwork(self.n_action, self.env).to(self.args.device)
        self.qNetwork.train()
        self.targetNetwork.eval()
        self.optimizer = optim.RMSprop(self.qNetwork.parameters(),
                                       lr=0.00025, eps=0.001, alpha=0.95)
        self.crit = nn.MSELoss()
        self.eps = max(self.args.eps, self.args.eps_min)
        self.eps_delta = (self.eps - self.args.eps_min) / self.args.exploration_decay_speed

    def reset(self):
        return torch.cat([preprocess_state(self.env.reset(), self.env)] * 4, 1)

    def select_action(self, state):
        action_prob = np.zeros(self.n_action, np.float32)
        action_prob.fill(self.eps / self.n_action)
        max_q, max_q_index = self.qNetwork(Variable(state.to(self.args.device))).data.cpu().max(1)
        action_prob[max_q_index[0]] += 1 - self.eps
        action = np.random.choice(self.arr_actions, p=action_prob)
        next_state, reward, done, _ = self.env.step(action)
        next_state = torch.cat(
            [state.narrow(1, 1, 3), preprocess_state(next_state, self.env)], 1)
        self.memory.push((state, torch.LongTensor([int(action)]),
                          torch.Tensor([reward]), next_state, torch.Tensor([done])))
        return next_state, reward, done, max_q[0]

    def run(self):
        state = self.reset()
        # init buffer
        for _ in range(self.args.buffer_init_size):
            next_state, _, done, _ = self.select_action(state)
            state = self.reset() if done else next_state

        total_frame = 0
        reward_list = np.zeros(self.args.log_size, np.float32)
        qval_list = np.zeros(self.args.log_size, np.float32)
        start_time = time.time()

        for epi in count():
            reward_list[epi % self.args.log_size] = 0
            qval_list[epi % self.args.log_size] = -1e9
            state = self.reset()
            done = False
            ep_len = 0

            if epi % self.args.save_freq == 0:
                model_file = os.path.join(self.work_dir, 'model.th')
                with open(model_file, 'wb') as f:
                    torch.save(self.qNetwork, f)

            while not done:
                if total_frame % self.args.sync_period == 0:
                    self.targetNetwork.load_state_dict(self.qNetwork.state_dict())
                self.eps = max(self.args.eps_min, self.eps - self.eps_delta)
                next_state, reward, done, qval = self.select_action(state)
                reward_list[epi % self.args.log_size] += reward
                qval_list[epi % self.args.log_size] = max(
                    qval_list[epi % self.args.log_size], qval)
                state = next_state
                total_frame += 1
                ep_len += 1

                if ep_len % self.args.learn_freq == 0:
                    batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
                        self.memory.sample(self.args.batch_size)
                    batch_q = self.qNetwork(batch_state).gather(
                        1, batch_action.unsqueeze(1)).squeeze(1)
                    batch_next_q = self.targetNetwork(batch_next_state).detach().max(1)[0] \
                        * self.args.gamma * (1 - batch_done)
                    loss = self.crit(batch_q, batch_reward + batch_next_q)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

            output_str = 'episode %d frame %d time %.2fs cur_rew %.3f mean_rew %.3f cur_maxq %.3f mean_maxq %.3f' % (
                epi, total_frame, time.time() - start_time,
                reward_list[epi % self.args.log_size], np.mean(reward_list),
                qval_list[epi % self.args.log_size], np.mean(qval_list))
            print(output_str)
            logging.info(output_str)
class Agent():
    def __init__(self, state_size, action_size, num_agents):
        state_dim = state_size
        # agent_input_state_dim = state_size * 2  # Previous state is passed in with the current state.
        action_dim = action_size
        self.num_agents = num_agents
        max_size = 100000
        self.replay = Replay(max_size)
        hidden_dim = 128

        self.critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_critic_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)
        self.target_actor_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=CRITIC_LEARNING_RATE)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=ACTOR_LEARNING_RATE)

    def get_action(self, state):
        return self.actor_net.get_action(state)[0]

    def add_replay(self, state, action, reward, next_state, done):
        for i in range(self.num_agents):
            self.replay.add(state[i], action[i], reward[i], next_state[i], done[i])

    def learning_step(self):
        # Check whether the replay buffer contains enough samples for one batch
        if self.replay.cursize < BATCH_SIZE:
            return

        # Get samples
        state, action, reward, next_state, done = self.replay.get(BATCH_SIZE)

        # Calculate losses
        actor_loss = self.critic_net(state, self.actor_net(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor_net(next_state)
        target_value = self.target_critic_net(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * DISCOUNT_RATE * target_value
        value = self.critic_net(state, action)
        critic_loss = F.mse_loss(value, expected_value.detach())

        # Backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft update
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

    def save(self, name):
        torch.save(self.critic_net.state_dict(), name + "_critic")
        torch.save(self.actor_net.state_dict(), name + "_actor")

    def load(self, name):
        self.critic_net.load_state_dict(torch.load(name + "_critic"))
        self.critic_net.eval()
        self.actor_net.load_state_dict(torch.load(name + "_actor"))
        self.actor_net.eval()
        for target_param, param in zip(self.target_critic_net.parameters(),
                                       self.critic_net.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor_net.parameters(),
                                       self.actor_net.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
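# A minimal sketch of how this multi-agent DDPG-style agent might be driven,
# assuming a vectorized environment `env` whose reset()/step() return per-agent
# arrays of states, rewards, and dones. The function name, episode count, and
# step limit are illustrative assumptions, not part of the original code.
def train(env, agent, num_episodes=200, max_steps=1000):
    for episode in range(num_episodes):
        states = env.reset()                       # shape: (num_agents, state_size)
        for _ in range(max_steps):
            actions = [agent.get_action(s) for s in states]
            next_states, rewards, dones, _ = env.step(actions)
            agent.add_replay(states, actions, rewards, next_states, dones)
            agent.learning_step()                  # no-op until the buffer holds one batch
            states = next_states
            if any(dones):
                break
    agent.save("checkpoint")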
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
        if self.policy_type == "Gaussian":
            action = torch.tanh(action)
        else:
            pass  # action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)

        """
        Use two Q-functions to mitigate positive bias in the policy improvement step,
        which is known to degrade the performance of value-based methods. Two Q-functions
        also significantly speed up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                """
                Alpha Loss
                """
                alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs
            """
            Including a separate function approximator for the soft value can stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * (target_value).detach()
        else:
            """
            There is no need in principle to include a separate function approximator
            for the state value. We use a target critic network for the deterministic
            policy and drop the value network completely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _ = self.policy.sample(next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * (target_critic).detach()

        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize
            training and is convenient to train simultaneously with the other networks.
            Update V towards the min of the two Q-functions in order to reduce
            overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass

        """
        The reparameterization trick is used to get a low-variance estimator.
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically: update the target parameters after every
        n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), \
            policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.alpha = args.alpha

        if self.policy_type == "Gaussian":
            # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs, self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            # A separate function approximator for the soft value can stabilize training.
            vf = self.value(state_batch)
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * (vf_next_target)
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs
            with torch.no_grad():
                next_state_action, _, _ = self.policy.sample(next_state_batch)
                # Use a target critic network for the deterministic policy and drop
                # the value network completely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)

        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            vf_target = min_qf_pi - (self.alpha * log_pi)
            # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            value_loss = F.mse_loss(vf, vf_target.detach())

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()
        # policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically: update the target parameters after every
        n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)
        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)

        return value_loss.item(), qf1_loss.item(), qf2_loss.item(), \
            policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class A2CAgent():
    """
    init function
        input:
            env, which is CartPole-v0
            gamma, 0.99 in this case
            lr, learning rate, 1e-4
        define:
            env = env, which is CartPole-v0
            obs_dim: 4 observations
                Observation:
                    Type: Box(4)
                    Num   Observation              Min       Max
                    0     Cart Position            -4.8      4.8
                    1     Cart Velocity            -Inf      Inf
                    2     Pole Angle               -24 deg   24 deg
                    3     Pole Velocity At Tip     -Inf      Inf
            action_dim: 2 actions
                Actions:
                    Type: Discrete(2)
                    Num   Action
                    0     Push cart to the left
                    1     Push cart to the right
            value_network: two-layer network with input 4 (observation dim) and output 1 (state value)
            policy_network: two-layer network with input 4 (observation dim) and output 2 (action dim)
            value and policy optimizers using default Adam and the learning rate
    """

    def __init__(self, env, gamma, lr):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    """
    Input a state to get the next action: the policy network produces logits,
    softmax turns them into a distribution, and the action is sampled from it.
    """

    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)
        return probs.sample().cpu().detach().item()

    """
    From the trajectory, get all of the information and calculate the discounted rewards.
    Use the value network to evaluate the states and compute the loss between the
    value and the target value using MSE; same logic for the policy network.

    FloatTensor = float-type array
        t
        tensor([[1, 2, 3],
                [4, 5, 6]])
        t.view(-1, 1)
        tensor([[1],
                [2],
                [3],
                [4],
                [5],
                [6]])
    """

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory])
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory])
        next_states = torch.FloatTensor([sars[3] for sars in trajectory])
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1)

        # compute value target
        # for each step j, sum the rewards from j onward, discounted by gamma
        discounted_rewards = [torch.sum(torch.FloatTensor(
            [self.gamma ** i for i in range(rewards[j:].size(0))]) * rewards[j:])
            for j in range(rewards.size(0))]  # sorry, not the most readable code
        value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # compute entropy bonus
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    """
    zero_grad clears old gradients from the last step (otherwise you'd just
    accumulate the gradients from all loss.backward() calls).
    loss.backward() computes the derivative of the loss w.r.t. the parameters
    (or anything requiring gradients) using backpropagation.
    opt.step() causes the optimizer to take a step based on the gradients of the parameters.
    """

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class DDPG:
    def __init__(self, cfg):
        self.device = cfg.device
        self.gamma = cfg.gamma
        self.batch_size = cfg.batch_size
        self.value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                      cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                        cfg.hidden_dim).to(self.device)
        self.target_value_net = ValueNetwork(cfg.state_dim, cfg.action_dim,
                                             cfg.hidden_dim).to(self.device)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_policy_net = PolicyNetwork(cfg.state_dim, cfg.action_dim,
                                               cfg.hidden_dim).to(self.device)
        self.target_policy_net.load_state_dict(self.policy_net.state_dict())
        self.soft_tau = cfg.soft_tau
        self.value_lr = cfg.value_lr
        self.policy_lr = cfg.policy_lr
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
        # mean squared error
        self.value_criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(cfg.replay_buffer_size)

    def update(self, cfg):
        state, action, reward, next_state, done = self.replay_buffer.sample(cfg.batch_size)
        # print(np.shape(state), np.shape(action), np.shape(reward), np.shape(next_state), np.shape(done))
        # (128, 3) (128, 1) (128,) (128, 3) (128,)
        state = torch.FloatTensor(state).to(cfg.device)
        action = torch.FloatTensor(action).to(cfg.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        done = torch.FloatTensor(done).unsqueeze(1).to(cfg.device)

        # Actor Loss
        policy_loss = self.value_net(state, self.policy_net(state))
        policy_loss = -policy_loss.mean()

        next_action = self.target_policy_net(next_state)
        target_value = self.target_value_net(next_state, next_action.detach())
        TD_target = reward + (1.0 - done) * self.gamma * target_value
        TD_target = torch.clamp(TD_target, -np.inf, np.inf)
        value = self.value_net(state, action)
        # Critic Loss
        value_loss = self.value_criterion(value, TD_target.detach())

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
        for target_param, param in zip(self.target_policy_net.parameters(),
                                       self.policy_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) +
                                    param.data * self.soft_tau)
class SAC(object):
    def __init__(self, config, env):
        self.device = config.device
        self.gamma = config.gamma                               # discount factor
        self.tau = config.tau                                   # soft-update coefficient
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr
        self.replace_target_iter = config.replace_target_iter   # target-network update frequency
        self.replay_size = config.replay_size                   # replay buffer size
        self.batch_size = config.batch_size                     # batch size
        self.num_states = env.observation_space.shape[0]        # state-space dimension
        self.num_actions = env.action_space.shape[0]            # action-space dimension
        self.learn_start = self.batch_size * 3                  # transitions required before learning starts
        self.learn_step_counter = 0                             # total number of learning steps

        self.memory = ReplayMemory(self.replay_size)            # initialize the replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions, 256).to(self.device)
        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions, 256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)

        # mean squared error losses
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store a transition
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # select an action
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.detach().cpu().numpy()
        return action[0]

    # get an action together with its log-probability
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization
        return action, log_prob, z, mean, log_std

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions
        # unzip the batch
        # e.g. if zipped is [(1, 4), (2, 5), (3, 6)], zip(*zipped) unzips it to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)
        # convert the samples to tensors
        batch_state = torch.tensor(batch_state, device=self.device, dtype=torch.float)
        batch_action = torch.tensor(batch_action, device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward, device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state, device=self.device, dtype=torch.float)
        batch_done = torch.tensor(batch_done, device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)  # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # learn
    def learn(self):
        # get a batch of samples
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch()
        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)   # q(s,a)
        expected_value = self.value_net(batch_state)                    # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)                                                # a~, log pi(a~|s), dist, mean, std
        target_value = self.target_value_net(batch_next_state)          # vtar(s')
        next_q_value = batch_reward + (1 - batch_done) * self.gamma * target_value  # r + gamma*(1-d)*vtar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state, new_action)  # q(s,a~)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value          # q(s,a~) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

        # increment the learning step counter
        self.learn_step_counter += 1

    # save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
class SAC:
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(n_states=self.n_states,
                                            n_actions=self.n_actions,
                                            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(self.device)
        self.value_target_network.load_state_dict(self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(), lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(), lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        batch = Transition(*zip(*batch))
        states = torch.cat(batch.state).view(self.batch_size, self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size, 1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1, self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(self.batch_size, self.n_states).to(self.device)
        return states, rewards, dones, actions, next_states

    def train(self):
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-value target
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                    self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network, self.value_target_network)

            return value_loss.item(), 0.5 * (q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.policy_network.state_dict(), self.env_name + "_weights.pth")

    def load_weights(self):
        self.policy_network.load_state_dict(torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        self.policy_network.eval()
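# A minimal sketch of an off-policy, step-wise training loop for the SAC agent
# above, assuming a Gym-style continuous-control env. The store/train/
# choose_action calls follow the class API; the function name and step budget
# are illustrative assumptions.
def run(env, agent, max_steps=100000):
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store(state, reward, done, action, next_state)
        value_loss, q_loss, policy_loss = agent.train()   # returns zeros until one batch is stored
        episode_reward += reward
        state = next_state
        if done:
            print("step {}: episode reward {:.1f}".format(step, episode_reward))
            state = env.reset()
            episode_reward = 0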
class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.scale_R = args.scale_R
        self.reparam = args.reparam
        self.deterministic = args.deterministic
        self.target_update_interval = args.target_update_interval

        self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.deterministic == False:
            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
            self.value_criterion = nn.MSELoss()
        else:
            self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size)
            hard_update(self.critic_target, self.critic)

        self.soft_q_criterion = nn.MSELoss()

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            _, _, action, _, _ = self.policy.evaluate(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.evaluate(state)
        action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch)
        mask_batch = torch.FloatTensor(np.float32(mask_batch))
        reward_batch = reward_batch.unsqueeze(1)  # reward_batch = [batch_size, 1]
        mask_batch = mask_batch.unsqueeze(1)      # mask_batch = [batch_size, 1]

        """
        Use two Q-functions to mitigate positive bias in the policy improvement step,
        which is known to degrade the performance of value-based methods. Two Q-functions
        also significantly speed up training, especially on harder tasks.
        """
        expected_q1_value, expected_q2_value = self.critic(state_batch, action_batch)
        new_action, log_prob, x_t, mean, log_std = self.policy.evaluate(
            state_batch, reparam=self.reparam)

        """
        Including a separate function approximator for the soft value can stabilize training.
        """
        expected_value = self.value(state_batch)
        target_value = self.value_target(next_state_batch)
        next_q_value = self.scale_R * reward_batch + mask_batch * self.gamma * target_value
        # Reward Scale * r(st,at) + γV(target)(st+1)

        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = self.soft_q_criterion(expected_q1_value, next_q_value.detach())
        q2_value_loss = self.soft_q_criterion(expected_q2_value, next_q_value.detach())

        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        """
        Including a separate function approximator for the soft value can stabilize
        training and is convenient to train simultaneously with the other networks.
        Update V towards the min of the two Q-functions in order to reduce
        overestimation bias from function approximation error.
        JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - log π(at|st)]))^2]
        ∇JV = ∇V(st)(V(st) - Q(st,at) + logπ(at|st))
        """
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach())
        log_prob_target = expected_new_q_value - expected_value

        if self.reparam == True:
            """
            The reparameterization trick is used to get a low-variance estimator.
            f(εt;st) = action sampled from the policy
            εt is an input noise vector, sampled from some fixed distribution
            Jπ = 𝔼st∼D,εt∼N[logπ(f(εt;st)|st) − Q(st,f(εt;st))]
            ∇Jπ = ∇log π + ([∇at log π(at|st) − ∇at Q(st,at)])∇f(εt;st)
            """
            policy_loss = (log_prob - expected_new_q_value).mean()
        else:
            # likelihood ratio gradient estimator
            policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()
        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.deterministic == False:
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        """
        We update the target weights to match the current value function weights
        periodically: update the target parameters after every
        n (args.target_update_interval) updates.
        """
        if updates % self.target_update_interval == 0 and self.deterministic == True:
            soft_update(self.critic_target, self.critic, self.tau)
            return 0, q1_value_loss.item(), q2_value_loss.item(), policy_loss.item()
        elif updates % self.target_update_interval == 0 and self.deterministic == False:
            soft_update(self.value_target, self.value, self.tau)
            return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(), policy_loss.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        print('Loading models from {}, {} and {}'.format(actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed this here; for the real implementation we should pass this in as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to the target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()
        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            # map a from [-1, 1] to [action_range[0], action_range[1]]
            a = a * (action_range[1] - action_range[0]) / 2.0 + \
                (action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # TODO: Question: why use 2 Q-networks?
        # To reduce bias in training.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for the policy net and target value net
        # TODO: Question: what does this part do?
        # The original paper mentioned 2 methods for approximating the value function:
        # 1. an exponential moving average of the policy weights to update the Q network
        # 2. a periodic update of the policy network, which is what this code uses
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
class SACV(object):
    def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size, lr, device):
        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.device = device

        self.policy = Actor(input_size, hidden_size, action_size).to(self.device)
        self.critic = Critic(input_size, hidden_size, action_size).to(self.device)
        self.value = ValueNetwork(input_size, hidden_size).to(self.device)

        self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr)
        self.value_optim = torch.optim.Adam(self.value.parameters(), lr=lr)

        self.value_target = copy.deepcopy(self.value)
        self.value_target.requires_grad_(False)

    def select_action(self, obs, sample=True):
        obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
        policy = TanhGaussian(*self.policy(obs))
        action = policy.sample(sample)
        action = action.detach().cpu().numpy()[0]
        return action

    def update_parameters(self, batch):
        obs, act, rew, done, obs_next = batch
        obs = torch.FloatTensor(obs).to(self.device)
        act = torch.FloatTensor(act).to(self.device)
        rew = torch.FloatTensor(rew).unsqueeze(-1).to(self.device)
        done = torch.BoolTensor(done).unsqueeze(-1).to(self.device)
        obs_next = torch.FloatTensor(obs_next).to(self.device)

        with torch.no_grad():
            next_v = self.value_target(obs_next).masked_fill(done, 0.)
            q_targ = rew + self.gamma * next_v

        self.critic_optim.zero_grad()
        q1, q2 = self.critic(obs, act)
        critic_loss = (q1 - q_targ).pow(2.).mul(0.5) + (q2 - q_targ).pow(2.).mul(0.5)
        critic_loss = critic_loss.mean()
        critic_loss.backward()
        self.critic_optim.step()
        with torch.no_grad():
            critic_loss = (torch.min(q1, q2) - q_targ).pow(2).mul(0.5).mean()

        self.policy_optim.zero_grad()
        policy = TanhGaussian(*self.policy(obs))
        action = policy.sample()
        log_pi = policy.log_prob(action, param_grad=False).sum(dim=-1, keepdim=True)
        action_value = torch.min(*self.critic(obs, action))
        with torch.no_grad():
            v_targ = action_value - self.alpha * log_pi

        self.value_optim.zero_grad()
        v = self.value(obs)
        value_loss = (v - v_targ).pow(2.).mul(0.5)
        value_loss = value_loss.mean()
        value_loss.backward()
        self.value_optim.step()

        policy_loss = self.alpha * log_pi - action_value
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.policy_optim.step()

        soft_update(self.value_target, self.value, self.tau)

        loss_info = {
            'critic_loss': critic_loss.item(),
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': -log_pi.mean().item()
        }
        return loss_info
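# The hard_update/soft_update helpers referenced by several of the classes above
# are not shown in this section. A minimal sketch, consistent with how they are
# called (hard_update(target, source), soft_update(target, source, tau)) and
# with the Polyak-averaging formula written out inline elsewhere in this code:
def hard_update(target, source):
    # copy the source parameters into the target network
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: θ_target = τ*θ_source + (1 - τ)*θ_target
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)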