Example #1
    def train_sl(self):
        # Supervised-learning update: fit the average-policy network to the
        # (state, action) pairs sampled from the SL buffer.
        assert len(self.buffer_sl) >= args.batch_size
        state, action = self.buffer_sl.sample(args.batch_size)

        state = U.Variable(torch.FloatTensor(state.astype(np.float32)))
        action = U.Variable(torch.LongTensor(action))

        # The loss below takes the log of the raw network output, so model_sl
        # is presumably ending in a softmax and returns action probabilities.
        logits = self.model_sl(state)
        logits_action = logits.gather(1, action.unsqueeze(1)).squeeze(1)

        # Negative log-likelihood of the actions that were actually taken.
        loss_sl = -(torch.log(logits_action)).mean()
        self.optimizer_sl.zero_grad()
        loss_sl.backward()
        self.optimizer_sl.step()
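The key step in the loss above is the gather call, which pulls out each row's probability for the action that was actually taken. A minimal standalone sketch of that pattern, with made-up tensors that are not part of the original class:

import torch

# Fake batch: 4 samples, 3 actions; rows are assumed to already be probabilities.
probs = torch.tensor([[0.2, 0.5, 0.3],
                      [0.7, 0.1, 0.2],
                      [0.1, 0.1, 0.8],
                      [0.4, 0.4, 0.2]])
action = torch.tensor([1, 0, 2, 0])

# Probability of the taken action per row, then mean negative log-likelihood,
# mirroring train_sl above.
probs_action = probs.gather(1, action.unsqueeze(1)).squeeze(1)
loss_sl = -(torch.log(probs_action)).mean()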
Example #2
    def average_strategy(self, state):
        # Pick the most probable action under the average-policy network.
        state = U.Variable(
            torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
        logits = self.model_sl(state)
        action_max_value, index = torch.max(logits, 1)
        action = index.item()
        return action
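All of the action-selection helpers in these examples use the same (value, index) pattern from torch.max over the action dimension. A small self-contained illustration with a made-up 1 x n_action output:

import torch

q = torch.tensor([[0.1, 2.3, -0.4]])   # fake network output for one state
value, index = torch.max(q, 1)          # best value and its column index
action = index.item()                   # 0-d index tensor -> plain Python int
print(value, index, action)             # tensor([2.3000]) tensor([1]) 1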
Example #3
    def best_response(self, state):
        # Greedy action with respect to the best-response (RL) Q-network.
        state = U.Variable(
            torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
        q_value = self.model_rl(state)
        action_max_value, index = torch.max(q_value, 1)
        action = index.item()
        return action
Example #4
    def max_action(self, state):
        # Optionally short-circuit to a fixed default action.
        if self.bool_defaule_action:
            return 2
        else:
            # Otherwise act greedily with respect to the Q-network.
            state = U.Variable(
                torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
            q_value = self.model(state)
            action_max_value, index = torch.max(q_value, 1)
            action = index.item()
            return action
Example #5
    def train_rl(self):
        # Q-learning update on a batch sampled from the RL replay buffer.
        assert len(self.buffer_rl) >= args.batch_size
        state, action, reward, next_state, done = self.buffer_rl.sample(
            args.batch_size)

        state = U.Variable(torch.FloatTensor(state.astype(np.float32)))
        next_state = U.Variable(
            torch.FloatTensor(next_state.astype(np.float32)))
        action = U.Variable(torch.LongTensor(action))
        reward = U.Variable(torch.FloatTensor(reward))
        done = U.Variable(torch.FloatTensor(done))

        q_values = self.model_rl(state)
        # Bootstrap from the target network if one is configured.
        if self.flag_target_net:
            next_q_values = self.target_model_rl(next_state)
        else:
            next_q_values = self.model_rl(next_state)

        # TD target: r + gamma * max_a' Q(s', a'), zeroed at terminal states.
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value_max = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value_max * (1 - done)

        # MSE between the predicted Q-value and the detached TD target.
        loss_rl = (q_value - expected_q_value.detach()).pow(2).mean()
        self.optimizer_rl.zero_grad()
        loss_rl.backward()
        self.optimizer_rl.step()
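train_rl only reads from self.target_model_rl; how that network is refreshed is not shown in these snippets. In a standard DQN setup it would be synced periodically from the online network, e.g. with a helper along these lines (an assumed addition, not taken from the original code):

    def update_target(self):
        # Hypothetical sync step: copy the online network's weights into the
        # target network every few training iterations.
        self.target_model_rl.load_state_dict(self.model_rl.state_dict())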
Example #6
    def egreedy_action(self, state, epsilon_decay=1):
        # Anneal epsilon: linearly down to 0.1, multiplicatively afterwards.
        if epsilon_decay:
            if self.epsilon > 0.1:
                self.epsilon = self.epsilon - 0.000005
            else:
                self.epsilon = self.epsilon * args.decay_rate

        if random.random() > self.epsilon:
            # Exploit: greedy action from the Q-network.
            state = U.Variable(
                torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
            q_value = self.model(state)
            action_max_value, index = torch.max(q_value, 1)
            action = index.item()
        else:
            # Explore: uniformly random action.
            action = random.randrange(self.n_action)
        return action