Python QNetTwin.get_q1_q2 Exemples

Langage de programmation: Python

Espace de noms/Paquet: elegantrl.tutorial.net

Class/Type: QNetTwin

Méthode/Fonction: get_q1_q2

Exemples sur hotexamples.com : 2

Python QNetTwin.get_q1_q2 - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de elegantrl.tutorial.net.QNetTwin.get_q1_q2 extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

QNetTwin(2)

get_q1_q2(2)

parameters(2)

get__q1_q2(1)

Méthodes fréquemment utilisées

QNetTwin (2)

get_q1_q2 (2)

parameters (2)

get__q1_q2 (1)

Exemple #1

0

Afficher le fichier

class AgentDoubleDQN(AgentDQN):
    """DQN-style agent for discrete actions that trains a twin-head Q-network.

    The TD target takes the element-wise minimum of the two target-network
    heads before maximising over actions, and both online heads are
    regressed onto that shared target.
    """

    def __init__(self):
        super().__init__()
        # Probability of sampling the action from softmax(Q) instead of
        # taking the greedy arg-max (epsilon-greedy style exploration).
        self.explore_rate = 0.25
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        """Create the networks, the loss and the optimizer.

        Args:
            net_dim: forwarded to ``QNetTwin`` as its first argument.
            state_dim: forwarded to ``QNetTwin`` as its second argument.
            action_dim: number of discrete actions; also used as the
                population size when sampling an exploratory action.
        """
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        # The acting network and the critic are the same object.
        self.act = self.cri

        self.criterion = torch.nn.SmoothL1Loss()
        # NOTE(review): self.learning_rate is not assigned in this class;
        # presumably it is provided by AgentDQN - confirm.
        self.cri_optimizer = torch.optim.Adam(
            self.act.parameters(), lr=self.learning_rate
        )

    def select_action(self, state):
        """Pick one discrete action for a single state.

        With probability ``self.explore_rate`` the action is sampled from
        the softmax of the network output; otherwise the arg-max is taken.

        Args:
            state: a single observation; it is wrapped in a 1-tuple to form
                a batch of size one before being fed to ``self.act``.

        Returns:
            The chosen action index: the value returned by ``rd.choice`` in
            the exploratory branch, or a 0-d NumPy array in the greedy one.
        """
        # Inference only: no autograd graph is needed for action selection.
        with torch.no_grad():
            states = torch.as_tensor(
                (state,), dtype=torch.float32, device=self.device
            )
            q_values = self.act(states)
            if rd.rand() < self.explore_rate:  # epsilon-greedy
                # choose action according to Q value
                a_prob = self.softmax(q_values)[0].cpu().numpy()
                a_int = rd.choice(self.action_dim, p=a_prob)
            else:
                a_int = q_values[0].argmax(dim=0).cpu().numpy()
        return a_int

    def update_net(self, buffer, target_step, batch_size, repeat_times):
        """Run ``int(target_step * repeat_times)`` critic updates.

        Args:
            buffer: replay buffer exposing ``update_now_len_before_sample()``
                and ``sample_batch(batch_size)``; the latter must return
                ``(reward, mask, action, state, next_s)``.
            target_step: multiplied by ``repeat_times`` to get the number of
                gradient steps.
            batch_size: batch size passed to ``buffer.sample_batch``.
            repeat_times: see ``target_step``.

        Returns:
            ``(mean target next-Q of the last batch, last loss / 2)``. The
            loss is the sum over the two heads, so halving gives the
            per-head average. If no update step is performed, both values
            are ``nan``.
        """
        buffer.update_now_len_before_sample()

        update_times = int(target_step * repeat_times)
        if update_times < 1:
            # Without this guard the final return would call .mean() on
            # None and raise AttributeError.
            return float("nan"), float("nan")

        next_q = obj_critic = None
        for _ in range(update_times):
            # The TD target must not receive gradients.
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
                next_q = torch.min(*self.cri_target.get_q1_q2(next_s))
                next_q = next_q.max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q

            # gather() needs integer indices.
            act_int = action.type(torch.long)
            q1, q2 = [qs.gather(1, act_int) for qs in self.cri.get_q1_q2(state)]
            obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            # NOTE(review): soft_update is not defined in this class;
            # presumably inherited from AgentDQN - confirm.
            self.soft_update(self.cri_target, self.cri)
        return next_q.mean().item(), obj_critic.item() / 2

Exemple #2

0

Afficher le fichier

class AgentDoubleDQN(AgentDQN):
    """DQN-style agent for discrete actions that trains a twin-head Q-network.

    The TD target takes the element-wise minimum of the two target-network
    heads before maximising over actions, and both online heads are
    regressed onto that shared target.
    """

    def __init__(self):
        super().__init__()
        # Probability of sampling the action from softmax(Q) instead of
        # taking the greedy arg-max (epsilon-greedy style exploration).
        self.explore_rate = 0.25
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        """Create the online network, the target network and the optimizer.

        Args:
            net_dim: forwarded to ``QNetTwin`` as its first argument.
            state_dim: forwarded to ``QNetTwin`` as its second argument.
            action_dim: number of discrete actions; also used as the
                population size when sampling an exploratory action.
        """
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        # Start the target as an exact copy of the online network. Two
        # independently initialised networks would make the first TD
        # targets come from unrelated weights.
        self.cri_target.load_state_dict(self.cri.state_dict())
        # The acting network and the critic are the same object.
        self.act = self.cri

        # NOTE(review): self.learning_rate is not assigned in this class;
        # presumably it is provided by AgentDQN - confirm.
        self.cri_optimizer = torch.optim.Adam(
            self.act.parameters(), lr=self.learning_rate
        )

    def select_action(self, state) -> np.ndarray:
        """Pick one discrete action for a single state.

        With probability ``self.explore_rate`` the action is sampled from
        the softmax of the network output; otherwise the arg-max is taken.

        Args:
            state: a single observation; it is wrapped in a 1-tuple to form
                a batch of size one before being fed to ``self.act``.

        Returns:
            The chosen action index: the value returned by ``rd.choice`` in
            the exploratory branch, or a 0-d NumPy array in the greedy one.
        """
        # Inference only: no autograd graph is needed for action selection.
        with torch.no_grad():
            states = torch.as_tensor(
                (state,), dtype=torch.float32, device=self.device
            )
            q_values = self.act(states)
            if rd.rand() < self.explore_rate:  # epsilon-greedy
                # choose action according to Q value
                a_prob = self.softmax(q_values)[0].cpu().numpy()
                a_int = rd.choice(self.action_dim, p=a_prob)
            else:
                a_int = q_values[0].argmax(dim=0).cpu().numpy()
        return a_int

    def get_obj_critic(self, buffer, batch_size) -> (torch.Tensor, torch.Tensor):
        """Sample one batch and compute the twin-head critic loss.

        Args:
            buffer: replay buffer whose ``sample_batch(batch_size)`` returns
                ``(reward, mask, action, state, next_s)``.
            batch_size: batch size passed to ``buffer.sample_batch``.

        Returns:
            ``(obj_critic, q1)``: the summed loss of both heads against the
            shared TD target, and the first head's Q-values for the sampled
            actions.
        """
        # The TD target must not receive gradients.
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_q = torch.min(*self.cri_target.get_q1_q2(next_s))
            next_q = next_q.max(dim=1, keepdim=True)[0]
            q_label = reward + mask * next_q

        # gather() needs integer indices.
        act_int = action.type(torch.long)
        q1, q2 = [qs.gather(1, act_int) for qs in self.act.get_q1_q2(state)]
        # NOTE(review): self.criterion is not assigned in this class;
        # presumably it is provided by AgentDQN - confirm.
        obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)
        return obj_critic, q1