def _init_agents(self):
     # parameter sharing
     self.n_heads = self.config['n_heads']
     self.embedding = Embedding_Layer(self.input_dim,
                                      self.hidden_dim).to(self.device)
     self.attention = Double_Attention_Model(self.hidden_dim,
                                             self.n_heads).to(self.device)
     self.embedding_target = Embedding_Layer(
         self.input_dim, self.hidden_dim).to(self.device)
     self.attention_target = Double_Attention_Model(
         self.hidden_dim, self.n_heads).to(self.device)
     Dueling_DDQN_Learner.copy_network(self.embedding,
                                       self.embedding_target)
     Dueling_DDQN_Learner.copy_network(self.attention,
                                       self.attention_target)
     self.share_para = chain(self.embedding.parameters(),
                             self.attention.parameters())
     self.all_para = chain(self.embedding.parameters(),
                           self.attention.parameters())
     # create one learner per agent, collect every parameter, then init the shared optimizer
     for i in range(self.num_agents):
         self.agents.append(Dueling_DDQN_Learner(self.config))
         self.all_para = chain(self.all_para,
                               self.agents[i].get_q_network().parameters())
     # materialize the chained generators so the parameters can be iterated
     # again later (e.g. by clip_grad_value_ in learn())
     self.all_para = list(self.all_para)
     self.share_optimizer = optim.RMSprop(self.all_para,
                                          lr=self.lr,
                                          weight_decay=1e-4)
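
The Dueling_DDQN_Learner helpers used above (copy_network and soft_update_of_target_network) are not shown in these examples; the sketch below illustrates what such helpers typically do, assuming both networks share the same architecture.

import torch


def copy_network(local_model, target_model):
    # hard update: target <- local
    target_model.load_state_dict(local_model.state_dict())


def soft_update_of_target_network(local_model, target_model, tau):
    # Polyak averaging: target <- tau * local + (1 - tau) * target
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.copy_(tau * local_param +
                               (1.0 - tau) * target_param)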
Example #2
 def learn(self):
     # if self.curr_step > 0 and self.curr_step % self.update_step == 0:
     for _ in range(self.update_step):
         (states, actions, rewards, next_states,
          is_dones) = self.sample_experience()
         actions = torch.from_numpy(actions).long().to(self.device)
         rewards = torch.from_numpy(rewards).float().to(self.device)
         is_dones = torch.from_numpy(is_dones).float().to(self.device)
         # states_embedding = self._get_embedding(states)
         # next_states_embedding = self._get_embedding(next_states)
         # next_states_embedding_target = self._get_embedding_target(next_states)
         total_loss = 0
          for i in range(states.shape[1]):  # one loss term per agent
             actions_values_current = self.learner.cal_current_actions_value(
                 states[:, i], next_states[:, i], rewards[:, i], is_dones)
             actions_values_expected = self.learner.cal_expected_actions_value(
                 states[:, i], actions[:, i])
             loss = F.mse_loss(actions_values_expected,
                               actions_values_current)
             # loss.backward(retain_graph=True)
             total_loss += loss
             # backpropagation
             # self.optimizer[i].zero_grad()
         total_loss /= self.num_agents
         self.share_optimizer.zero_grad()
         total_loss.backward()
         # self._scale_shared_grads()
         torch.nn.utils.clip_grad_value_(self.all_para, 1)
         self.share_optimizer.step()
         Dueling_DDQN_Learner.soft_update_of_target_network(
             self.embedding, self.embedding_target, self.tau)
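
cal_current_actions_value and cal_expected_actions_value are likewise not shown. The sketch below illustrates the double-DQN style quantities such methods usually compute; the function names, the gamma argument, and the assumption that inputs are already float/long tensors on the right device are illustrative, not taken from the original code.

import torch


def double_dqn_target(q_network, q_network_target, next_states, rewards,
                      is_dones, gamma):
    # TD target: r + gamma * Q_target(s', argmax_a Q_online(s', a)) * (1 - done)
    with torch.no_grad():
        best_actions = q_network(next_states).argmax(dim=1, keepdim=True)
        next_values = q_network_target(next_states).gather(1, best_actions)
        return rewards.unsqueeze(1) + gamma * next_values * (
            1.0 - is_dones.unsqueeze(1))


def expected_actions_value(q_network, states, actions):
    # Q_online(s, a) for the actions actually taken in the replay batch
    return q_network(states).gather(1, actions.unsqueeze(1))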
Example #3
 def _init_agents(self):
     # parameter sharing
     self.embedding = Base_Model(self.input_dim, self.hidden_dim,
                                 self.output_dim).to(self.device)
     self.embedding_target = Base_Model(self.input_dim, self.hidden_dim,
                                        self.output_dim).to(self.device)
     # Dueling_DDQN_Learner.copy_network(self.embedding, self.embedding_target)
     # init the optimizer
     self.learner = Dueling_DDQN_Learner(self.config)
     # for i in range(self.num_agents):
     #     self.agents.append(Dueling_DDQN_Learner(self.config))
     self.learner.set_q_network(self.embedding, self.embedding_target)
     # keep a concrete list so the parameters can be iterated again later
     # (e.g. by clip_grad_value_ in learn())
     self.all_para = list(self.embedding.parameters())
     self.share_optimizer = optim.RMSprop(self.all_para,
                                          lr=self.lr,
                                          weight_decay=1e-4)
    def _init_agents(self):
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)

        self.share_para = self.embedding.parameters()
        self.all_para = self.embedding.parameters()
        # init the optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para = chain(self.all_para,
                                  self.agents[i].get_q_network().parameters())
            # para = chain(self.embedding.parameters(), self.agents[i].get_q_network().parameters())
            # self.optimizer.append(optim.Adam(self.agents[i].get_q_network().parameters(), lr=1e-3))
        # self.all_para = chain(self.all_para)
        # materialize the chained parameters so they can be iterated again
        # later (e.g. by clip_grad_value_ in learn())
        self.all_para = list(self.all_para)
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)
 def _update_sharing_target_network(self):
     Dueling_DDQN_Learner.soft_update_of_target_network(
         self.embedding, self.embedding_target, self.tau)
     Dueling_DDQN_Learner.soft_update_of_target_network(
         self.rnn, self.rnn_target, self.tau)
     Dueling_DDQN_Learner.soft_update_of_target_network(
         self.attention, self.attention_target, self.tau)
    def _init_agents(self):
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.attention = Attention_Model(self.hidden_dim).to(self.device)
        self.temporal_attention = Attention_Model(self.hidden_dim).to(
            self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        self.attention_target = Attention_Model(self.hidden_dim).to(
            self.device)
        self.temporal_attention_target = Attention_Model(self.hidden_dim).to(
            self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)
        Dueling_DDQN_Learner.copy_network(self.attention,
                                          self.attention_target)
        Dueling_DDQN_Learner.copy_network(self.temporal_attention,
                                          self.temporal_attention_target)
        for i in range(self.num_agents):
            q_network = Double_Attention_Model(self.input_dim, self.output_dim,
                                               self.hidden_dim).to(self.device)
            q_network_target = Double_Attention_Model(
                self.input_dim, self.output_dim,
                self.hidden_dim).to(self.device)
            q_network.set_layer_para(self.embedding, self.attention,
                                     self.temporal_attention)
            q_network_target.set_layer_para(self.embedding_target,
                                            self.attention_target,
                                            self.temporal_attention_target)
            self.agents[i].set_q_network(q_network, q_network_target)
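
set_layer_para is not shown either; presumably it points each per-agent Double_Attention_Model at the shared embedding/attention modules created above, so every agent trains the same weights. A minimal sketch of that idea (the class and attribute names here are stand-ins, not the original API):

import torch.nn as nn


class SharedLayerQNetwork(nn.Module):
    # illustrative stand-in for a per-agent network that reuses shared modules
    def set_layer_para(self, embedding, attention, temporal_attention):
        # assigning existing nn.Module instances registers them as sub-modules,
        # so every agent holding the same instances shares (and updates) the
        # same parameters
        self.embedding = embedding
        self.attention = attention
        self.temporal_attention = temporal_attention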


# def change_mode(self):
# self.q_network.change_mode()
# self.q_network_target.change_mode()
# for i in range(self.num_agents):
#     self.agents[i].q_network_current.change_mode()
#     self.agents[i].q_network_target.change_mode()
# self.embedding = Embedding_Layer(self.input_dim, self.hidden_dim[0])
# self.attention = Attention_Layer(self.hidden_dim[0], self.hidden_dim[1], self.hidden_dim[2])
# self.embedding_target = Embedding_Layer(self.input_dim, self.hidden_dim[0])
# self.attention_target = Attention_Layer(self.hidden_dim[0], self.hidden_dim[1], self.hidden_dim[2])
# Dueling_DDQN_Learner.copy_network(self.embedding, self.embedding_target)
# Dueling_DDQN_Learner.copy_network(self.attention, self.attention_target)
# def get_action(self, i, obs):
#     return self.agents[i].step(obs)
# def store_experience(self, i, obs, action, reward, next_obs, is_done):
#     self.agents[i].store_experience(obs, action, reward, next_obs, is_done)
Example #7
class IQL_Agents(Basic_Agents):
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):
        super().__init__(config, num_agents, input_dim, hidden_dim, output_dim)
        self._init_agents()

    def _init_agents(self):
        # parameter sharing
        self.embedding = Base_Model(self.input_dim, self.hidden_dim,
                                    self.output_dim).to(self.device)
        self.embedding_target = Base_Model(self.input_dim, self.hidden_dim,
                                           self.output_dim).to(self.device)
        # Dueling_DDQN_Learner.copy_network(self.embedding, self.embedding_target)
        # init the optimizer
        self.learner = Dueling_DDQN_Learner(self.config)
        # for i in range(self.num_agents):
        #     self.agents.append(Dueling_DDQN_Learner(self.config))
        self.learner.set_q_network(self.embedding, self.embedding_target)
        # keep a concrete list so the parameters can be iterated again later
        # (e.g. by clip_grad_value_ in learn())
        self.all_para = list(self.embedding.parameters())
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def learn(self):
        # if self.curr_step > 0 and self.curr_step % self.update_step == 0:
        for _ in range(self.update_step):
            (states, actions, rewards, next_states,
             is_dones) = self.sample_experience()
            actions = torch.from_numpy(actions).long().to(self.device)
            rewards = torch.from_numpy(rewards).float().to(self.device)
            is_dones = torch.from_numpy(is_dones).float().to(self.device)
            # states_embedding = self._get_embedding(states)
            # next_states_embedding = self._get_embedding(next_states)
            # next_states_embedding_target = self._get_embedding_target(next_states)
            total_loss = 0
            for i in range(states.shape[1]):  # one loss term per agent
                actions_values_current = self.learner.cal_current_actions_value(
                    states[:, i], next_states[:, i], rewards[:, i], is_dones)
                actions_values_expected = self.learner.cal_expected_actions_value(
                    states[:, i], actions[:, i])
                loss = F.mse_loss(actions_values_expected,
                                  actions_values_current)
                # loss.backward(retain_graph=True)
                total_loss += loss
                # backpropagation
                # self.optimizer[i].zero_grad()
            total_loss /= self.num_agents
            self.share_optimizer.zero_grad()
            total_loss.backward()
            # self._scale_shared_grads()
            torch.nn.utils.clip_grad_value_(self.all_para, 1)
            self.share_optimizer.step()
            Dueling_DDQN_Learner.soft_update_of_target_network(
                self.embedding, self.embedding_target, self.tau)

    def step(self, state, test=False):
        action = []
        for i in range(state.shape[1]):
            action.append(self.learner.step(state[:, i], test))
        action = np.asarray(action)
        self.curr_step += 1
        return action
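
For context, a hedged usage sketch of IQL_Agents: the config keys, the dimensions, and the (batch, num_agents, input_dim) state layout implied by the state[:, i] indexing in step() are illustrative guesses; the real keys are defined by Basic_Agents and Dueling_DDQN_Learner, which are not shown here.

import numpy as np

num_agents, input_dim, hidden_dim, output_dim = 4, 16, 64, 5
config = {'lr': 1e-3, 'tau': 0.01, 'update_step': 4}  # illustrative keys only
agents = IQL_Agents(config, num_agents, input_dim, hidden_dim, output_dim)

state = np.random.randn(1, num_agents, input_dim).astype(np.float32)
actions = agents.step(state)  # one action per agent (selection policy lives in Dueling_DDQN_Learner.step)
# ...store transitions in the replay buffer, then periodically call:
# agents.learn()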