class DuelingAgent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDuelingDQN(env.observation_space.shape, env.action_space.n).to(self.device)
        else:
            self.model = DuelingDQN(env.observation_space.shape, env.action_space.n).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # epsilon-greedy: explore with probability eps, otherwise act greedily
        if np.random.rand() < eps:
            return self.env.action_space.sample()

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # mask out the bootstrap term for terminal transitions
        expected_Q = rewards.squeeze(1) + self.gamma * (1 - dones) * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
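A minimal usage sketch for the discrete-action agents in this collection, assuming a classic Gym control task and that BasicBuffer exposes push() and __len__() as used below; the environment name, episode count, and batch size are illustrative choices, not part of the original code:

# Hypothetical training loop for the DQN-style agents (sketch, not the original driver code).
import gym

env = gym.make("CartPole-v1")              # illustrative environment choice
agent = DuelingAgent(env, use_conv=False)  # any of the discrete-action agents fits this loop
batch_size = 32

for episode in range(200):
    state = env.reset()
    episode_reward = 0
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward

        # learn only once the buffer holds at least one batch
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        state = next_state
    print("Episode {}: reward {}".format(episode, episode_reward))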
class C51Agent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_size)
        self.model = DistributionalDQN(self.env.observation_space.shape, self.env.action_space.n, use_conv)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        dist, qvals = self.model.forward(state)
        action = np.argmax(qvals.detach().numpy())
        return action

    def compute_error(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_dist, _ = self.model.forward(states)
        curr_action_dist = curr_dist[range(batch_size), actions]

        next_dist, next_qvals = self.model.forward(next_states)
        next_actions = torch.max(next_qvals, 1)[1]
        next_dist = self.model.softmax(next_dist)
        optimal_dist = next_dist[range(batch_size), next_actions]

        # project the target distribution onto the fixed support
        projection = dist_projection(optimal_dist, rewards, dones, self.gamma,
                                     self.model.n_atoms, self.model.Vmin,
                                     self.model.Vmax, self.model.support)

        loss = -KL_divergence_two_dist(optimal_dist, projection)
        return loss

    def update(self, batch_size):
        loss = self.compute_error(batch_size)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class NoisyDQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=1e-4, gamma=0.99, buffer_maxlen=100000):
        self.env = env
        self.use_conv = use_conv
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(buffer_maxlen)

        if self.use_conv:
            self.model = ConvNoisyDQN(env.observation_space.shape, env.action_space.n)
        else:
            self.model = NoisyDQN(self.env.observation_space.shape, self.env.action_space.n)

        self.MSE_loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def get_action(self, state):
        # exploration comes from the noisy layers, so the policy is greedy w.r.t. the Q-values
        state = torch.FloatTensor(state).unsqueeze(0)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # mask out the bootstrap term for terminal transitions
        expected_Q = rewards.squeeze(1) + self.gamma * (1 - dones) * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value nets
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
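The continuous-control agents (SAC, TD3, DDPG) are driven the same way, except that actions are real-valued vectors and the constructors take explicit hyperparameters. A sketch under the same BasicBuffer assumptions; "Pendulum-v0" and all hyperparameter values are chosen only for illustration:

# Hypothetical training loop for the continuous-control agents (sketch, not the original driver code).
import gym

env = gym.make("Pendulum-v0")          # illustrative environment choice
agent = SACAgent(env, gamma=0.99, tau=0.01, v_lr=3e-3, q_lr=3e-3,
                 policy_lr=3e-3, buffer_maxlen=100000)
batch_size, max_episodes, max_steps = 64, 100, 500

for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward

        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        state = next_state
        if done:
            break
    print("Episode {}: reward {}".format(episode, episode_reward))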
class QCommander(Commander):

    def __init__(self, thisRegiment, enemyRegiment, nbatcommand, gamma=0.95, buffer_size=256):
        super().__init__(thisRegiment, enemyRegiment, nbatcommand)
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = None
        self.model = None
        self.optimizer = None
        self.MSE_loss = None

    def set_model(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        input_dim = self.thisRegiment_size + self.enemyRegiment_size
        output_dim = len(self.order_action_map)
        self.model = DQN(input_dim, output_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def order(self, state, eps=0.2):
        '''
        state: [array of regiment1 health + array of regiment2 health]
        e.g. action: (4, None, None, 4) -> battalion 4 will not attack any enemy
        battalion (wasteful!). This only happens when the agent chooses according
        to the q-table (early stage).
        '''
        if np.random.uniform(0, 1) < eps:
            # Make sure all actions get chosen. Otherwise, some are never visited and updated.
            thisaction = random.sample(list(self.action_order_map.keys()), self.nbatcommand)[0]
            return self.action_order_map[thisaction], thisaction

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        # self.model.eval()  # needed when forward passing one sample into a net with a batchnorm layer
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return self.action_order_map[action], action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = np.array([int(done) for done in dones])
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))  # [batch_size, 1]
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)  # [batch_size, naction]
        max_next_Q = torch.max(next_Q, 1)[0]      # [batch_size]
        expected_Q = rewards.squeeze(1) + self.gamma * (1 - dones) * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        # batch = self.replay_buffer.sample_sequence(batch_size)
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
class DQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99, buffer_size=10000, resume=False):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.epsilon = 1
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.0005
        self.losses = []
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.path = 'saved-models/qwop_cnn.game.model'
        self.model = CNN().to(self.device)
        self.date = datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
        self.save_path = 'saved-models/' + self.date + '-qwop_cnn.game' + '.model'

        if resume:
            self.model.load_state_dict(torch.load(self.path))
            self.model.eval()
            with open('states/epsilon_decay.txt') as f:
                self.epsilon = float(f.readline())

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state):
        state = torch.unsqueeze(state, 0).float().to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        # anneal epsilon and act epsilon-greedily
        if self.epsilon > self.epsilon_min:
            self.epsilon *= (1 - self.epsilon_decay)
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # stack the stored frame tensors into batches so gradients flow through the model
        states = torch.stack([s.float() for s in states]).to(self.device)
        next_states = torch.stack([s.float() for s in next_states]).to(self.device)

        curr_Q = self.model.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_Q = self.model.forward(next_states)
            max_next_Q = torch.max(next_Q, 1)[0]
            expected_Q = rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        try:
            batch = self.replay_buffer.sample(batch_size)
            loss = self.compute_loss(batch)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.losses.append(loss.item())
        except ValueError:
            print('Error')

    def update_buffer(self, prev_state, action, reward, next_state, done):
        self.replay_buffer.push(prev_state, action, reward, next_state, done)

    def save_model(self):
        torch.save(self.model.state_dict(), self.save_path)
        with open("states/epsilon_decay_" + self.date + '.txt', "w") as f:
            f.write(str(self.epsilon))
class TD3Agent:

    def __init__(self, env: object, gamma: float, delay_step: int, tau: float,
                 buffer_maxlen: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):
        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The Gym environment used for training and evaluation
        self.env = env

        # Get max and min values of the action space of this environment
        self.action_range = [self.env.action_space.low, self.env.action_space.high]

        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Total step counter
        self.steps = 0

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.delay_step = delay_step

        # Scaling and bias factors for the actions -> needed because each
        # environment has different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks and target actor network
        for target_param, param in zip(self.target_critic1.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_critic2.parameters(), self.critic2.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int, steps: int):
        self.steps = steps

        # Sample experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Sample actions for the next states (s_t+1) using the target actor
        next_actions = self.target_actor.forward(next_states)
        next_actions = self.rescale_action(next_actions)

        # Add clipped gaussian noise to the actions (target policy smoothing)
        noise = self.get_noise(next_actions, self.noise_std + 0.1, -self.noise_bound, self.noise_bound)
        noisy_next_actions = next_actions + noise

        # Compute Q(s_t+1, a_t+1)
        next_q1 = self.target_critic1(next_states, noisy_next_actions)
        next_q2 = self.target_critic2(next_states, noisy_next_actions)

        # Choose the minimum Q
        min_q = torch.min(next_q1, next_q2)

        # Find expected Q, i.e., r(t) + gamma * next_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q

        # Find current Q values for the given states and actions from the replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)

        # Compute loss between Q network and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # actor update (computing the loss)
        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)

            # Compute Q(s_t, a_t)
            new_q1 = self.critic1.forward(states, new_actions)

            # Compute the actor loss, i.e., -Q1
            actor_loss = -new_q1.mean()

            # Backpropagate the loss and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(), self.critic1.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_critic2.parameters(), self.critic2.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float, bottom: float, top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the noise
        return torch.normal(torch.zeros(action.size()), sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless, deterministic action)

        # Convert state numpy to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)

        if stochastic:
            # Add gaussian noise to the rescaled action
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)

        # Convert action tensor to numpy
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # We rescale the action since the output of the actor network is [-1, 1] while the mujoco
        # environments can range over [-n, n] where n is an arbitrary real value.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the actor model
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the actor model
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
class TD3Agent:
    """
    Each joint is treated as an agent, so there is one action (agent) value per joint.
    """

    def __init__(self, env: object, gamma: float, tau: float, buffer_maxlen: int,
                 delay_step: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Set up the state (observation) and action spaces from the environment
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step
        self.buffer_maxlen = buffer_maxlen

        self.critic1 = []
        self.critic2 = []
        self.critic_target1 = []
        self.critic_target2 = []
        self.actor = []
        self.actor_target = []
        self.critic_optimizer1 = []
        self.critic_optimizer2 = []
        self.actor_optimizer = []

        # initialize actor and critic networks per action dimension (multi-agent setup)
        for _ in range(self.action_dim):
            self.critic1.append(Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic2.append(Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic_target1.append(Critic(self.obs_dim, self.action_dim).to(self.device))
            self.critic_target2.append(Critic(self.obs_dim, self.action_dim).to(self.device))

        for _ in range(self.action_dim):
            self.actor.append(Actor(self.obs_dim, self.action_dim).to(self.device))
            self.actor_target.append(Actor(self.obs_dim, self.action_dim).to(self.device))

        # Copy critic target parameters
        for i in range(self.action_dim):
            for target_param, param in zip(self.critic_target1[i].parameters(), self.critic1[i].parameters()):
                target_param.data.copy_(param.data)
            for target_param, param in zip(self.critic_target2[i].parameters(), self.critic2[i].parameters()):
                target_param.data.copy_(param.data)

        # initialize optimizers
        for i in range(self.action_dim):
            self.critic_optimizer1.append(optim.Adam(self.critic1[i].parameters(), lr=critic_lr))
            self.critic_optimizer2.append(optim.Adam(self.critic2[i].parameters(), lr=critic_lr))
            self.actor_optimizer.append(optim.Adam(self.actor[i].parameters(), lr=actor_lr))

        self.replay_buffer = BasicBuffer(10000)
        self.replay_buffer_base = BasicBuffer(self.buffer_maxlen)

    def get_action(self, obs: np.ndarray) -> Tuple[list, list]:
        # Take the state and compute the actions
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action_list = []
        action_list_ = []

        # For each node, run inference through its network and append to the lists.
        # `action` is for training (with noise), `action_` is for testing (without noise).
        for i in range(self.action_dim):
            action_list.append((self.actor[i].forward(state[0, i])).cpu().detach() +
                               (self.generate_action_space_noise(0.4)).cpu().detach())
            action_list_.append((self.actor[i].forward(state[0, i])).cpu().detach())

        action = action_list
        action_ = action_list_
        return action, action_

    def update(self, batch_size: int, step_env: int):
        # Sample a batch from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, dones = self.replay_buffer.sample(batch_size)

        # Initialize the batch-sample variables
        state_batch = np.array(state_batch)
        action_batch = np.array(action_batch)
        reward_batch = np.array(reward_batch)
        next_state_batch = np.array(next_state_batch)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Update each node
        for i in range(self.action_dim):
            # After finding null actions, zero out the related node entries
            action_null = np.where(action_batch[:, i] == 0)
            state_batch_null = state_batch[:, i]
            state_batch_null[action_null] = 0
            state_batch_null = torch.FloatTensor(state_batch_null).to(self.device)
            action_batch_null = action_batch[:, i]
            action_batch_null[action_null] = 0
            action_batch_null = torch.FloatTensor(action_batch_null).to(self.device)
            next_state_batch_null = next_state_batch[:, i]
            next_state_batch_null[action_null] = 0
            next_state_batch_null = torch.FloatTensor(next_state_batch_null).to(self.device)
            reward_batch_null = reward_batch
            reward_batch_null[action_null] = 0
            reward_batch_null = torch.FloatTensor(reward_batch_null).to(self.device)

            # Add noise to the next action
            action_space_noise = self.generate_action_space_noise(0.2)
            next_actions = self.actor[i].forward(next_state_batch_null) + action_space_noise

            # Compute the expected Q-value at s_t+1
            next_Q1 = self.critic_target1[i].forward(next_state_batch_null, next_actions)
            next_Q2 = self.critic_target2[i].forward(next_state_batch_null, next_actions)
            expected_Q = reward_batch_null + (1 - dones) * self.gamma * torch.min(next_Q1, next_Q2)
            expected_Q = expected_Q.cpu().detach().numpy()
            expected_Q[action_null] = 0
            expected_Q = torch.FloatTensor(expected_Q).to(self.device)

            # Build a masking array to remove the effect of the null nodes
            masking_torch = np.ones([batch_size, 1])
            masking_torch[action_null] = 0
            masking_torch = torch.FloatTensor(masking_torch).to(self.device)

            # Critic value inference for training
            curr_Q1 = self.critic1[i].forward(state_batch_null, action_batch_null.reshape(-1, 1))
            curr_Q1 *= masking_torch.detach()
            curr_Q2 = self.critic2[i].forward(state_batch_null, action_batch_null.reshape(-1, 1))
            curr_Q2 *= masking_torch.detach()

            # Critic loss: (Q(s_t) - (r + gamma * Q(s_t+1)))^2
            critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
            critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

            # Optimizer steps
            self.critic_optimizer1[i].zero_grad()
            critic1_loss.backward()
            self.critic_optimizer1[i].step()

            self.critic_optimizer2[i].zero_grad()
            critic2_loss.backward()
            self.critic_optimizer2[i].step()

            # delayed update for actor & target networks
            if self.update_step % self.delay_step == 0:
                # actor
                new_actions = self.actor[i](state_batch_null)
                policy_gradient = -self.critic1[i](state_batch_null, new_actions)
                policy_gradient *= masking_torch.detach()
                policy_gradient = policy_gradient.mean()

                self.actor_optimizer[i].zero_grad()
                policy_gradient.backward()
                self.actor_optimizer[i].step()

                # target networks
                self.update_targets(i)

        self.update_step += 1

    def generate_action_space_noise(self, noise_std: float) -> torch.Tensor:
        noise = torch.normal(torch.zeros(1), noise_std).clamp(-self.noise_bound, self.noise_bound).to(self.device)
        return noise

    def update_targets(self, i: int):
        for target_param, param in zip(self.critic_target1[i].parameters(), self.critic1[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target2[i].parameters(), self.critic2[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.actor_target[i].parameters(), self.actor[i].parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

    def Actor_save(self):
        # save the model for each node
        print("Save the torch model")
        for i in range(self.action_dim):
            savePath = "./actor_model5_Hop_" + str(i) + ".pth"
            torch.save(self.actor[i].state_dict(), savePath)

    def Actor_load(self):
        # load the model for each node
        print("load the torch model")
        for i in range(self.action_dim):
            savePath = "./actor_model_wlk" + str(i) + ".pth"  # Best
            self.actor[i] = Actor(self.obs_dim, self.action_dim).to(self.device)
            self.actor[i].load_state_dict(torch.load(savePath))
class SACAgent:

    def __init__(self, env: object, gamma: float, tau: float, buffer_maxlen: int,
                 critic_lr: float, actor_lr: float, reward_scale: int):
        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The Gym environment used for training and evaluation
        self.env = env

        # Get max and min values of the action space of this environment
        self.action_range = [self.env.action_space.low, self.env.action_space.high]

        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factors for the actions -> needed because each
        # environment has different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sample experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        # in SAC we apply reward scaling to the sampled rewards
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details
        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1, a_t+1) with both target Q networks and take the minimum
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1, next_q2)

        # Compute the next Q target: min_q - alpha * next_log_pi
        next_q_target = (min_q - next_log_pi)

        # Compute Q(s_t, a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Find expected Q, i.e., r(t) + gamma * next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute loss between Q network and expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t, a_t) and choose the minimum of the 2 Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Compute the policy loss, i.e., alpha * log_pi - Q(s_t, a_t), eq. (7)
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the loss and update policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update the target networks with a soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
        for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_action(self, state: np.ndarray,
                   stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless, deterministic action)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and sigma from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training, non-stochastic mode is used for evaluation
        if stochastic:
            normal = Normal(mean, std)
        else:
            normal = Normal(mean, 0)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        # Return a rescaled action, plus the mean and standard deviation of the action.
        # We rescale the action since the output of the policy network is [-1, 1] while the
        # mujoco environments can range over [-n, n] where n is an arbitrary real value.
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # We rescale the action since the output of the policy network is [-1, 1] while the
        # mujoco environments can range over [-n, n] where n is an arbitrary real value.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the policy model
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the policy model
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        # self.action_range = [env.action_space.low, env.action_space.high]
        # TODO: as a simple demo, I changed this here; for the real implementation, we should pass it as parameters
        self.action_range = [[-1, 1], [-1, 1]]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = 2
        # self.action_dim = 1

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    # pi: state -> action
    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        '''if action < 0.5:
            return 0
        else:
            return 1'''
        scaled_action = []
        for idx, a in enumerate(action):
            action_range = self.action_range[idx]
            # map a in [-1, 1] to the corresponding action range
            a = a * (action_range[1] - action_range[0]) / 2.0 + \
                (action_range[1] + action_range[0]) / 2.0
            scaled_action.append(a)
        return scaled_action

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # TODO: Question: why use 2 Q-networks?
        # To reduce overestimation bias during training.

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value nets
        # TODO: Question: what does this part do?
        # The original paper mentions 2 ways of updating the target value function:
        # 1. an exponentially moving average of the value-network weights (the soft update below)
        # 2. a periodic (delayed) update, which is what this code uses
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
class TD3Agent:

    def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std, noise_bound, critic_lr, actor_lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.update_step = 0
        self.delay_step = delay_step

        # initialize actor and critic networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic1_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):
            target_param.data.copy_(param.data)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        action_space_noise = self.generate_action_space_noise(action_batch)
        next_actions = self.actor.forward(state_batch) + action_space_noise
        next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)
        next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)
        expected_Q = reward_batch + self.gamma * torch.min(next_Q1, next_Q2)

        # critic loss
        curr_Q1 = self.critic1.forward(state_batch, action_batch)
        curr_Q2 = self.critic2.forward(state_batch, action_batch)
        critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
        critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())

        # update critics
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # delayed update for actor & target networks
        if self.update_step % self.delay_step == 0:
            # actor
            self.actor_optimizer.zero_grad()
            policy_gradient = -self.critic1(state_batch, self.actor(state_batch)).mean()
            policy_gradient.backward()
            self.actor_optimizer.step()

            # target networks
            self.update_targets()

        self.update_step += 1

    def generate_action_space_noise(self, action_batch):
        noise = torch.normal(torch.zeros(action_batch.size()),
                             self.noise_std).clamp(-self.noise_bound, self.noise_bound).to(self.device)
        return noise

    def update_targets(self):
        for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
class DDPGAgent:

    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
class DoubleDQNAgent:

    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99, tau=0.01, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters(), lr=self.learning_rate)
        self.optimizer2 = torch.optim.Adam(self.model2.parameters(), lr=self.learning_rate)

    def get_action(self, state, eps=0.20):
        # epsilon-greedy: explore with probability eps, otherwise act greedily w.r.t. model1
        if np.random.rand() < eps:
            return np.random.choice(self.env.action_space)

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # resize tensors
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        # clipped double Q-learning target: take the smaller of the two greedy estimates
        next_Q = torch.min(torch.max(next_Q1, 1)[0], torch.max(next_Q2, 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())

        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()