def train_sl(self):
    # Supervised-learning update: fit the average-strategy network to the
    # (state, action) pairs stored in the SL buffer.
    assert len(self.buffer_sl) >= args.batch_size
    state, action = self.buffer_sl.sample(args.batch_size)

    state = U.Variable(torch.FloatTensor(state.astype(np.float32)))
    action = U.Variable(torch.LongTensor(action))

    # The network output is used directly under torch.log, so model_sl is
    # expected to emit action probabilities (e.g. a softmax head).
    # (An unused F.one_hot call was dropped here; its second argument was
    # the batch size rather than the number of actions, and the result was
    # never read.)
    probs = self.model_sl(state)
    probs_action = probs.gather(1, action.unsqueeze(1)).squeeze(1)
    loss_sl = -torch.log(probs_action).mean()

    self.optimizer_sl.zero_grad()
    loss_sl.backward()
    self.optimizer_sl.step()
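# `buffer_sl` is not defined in this section. In NFSP the supervised-learning
# memory is usually a reservoir buffer, so the stored (state, action) pairs
# approximate a uniform sample over the whole history of play. A minimal
# sketch, assuming that design; the class name `ReservoirBuffer` and its
# interface are illustrative, not taken from this codebase:

import random
import numpy as np

class ReservoirBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.n_seen = 0  # total items ever offered, including discarded ones

    def push(self, state, action):
        # Classic reservoir sampling: every pushed item ends up in the
        # buffer with equal probability capacity / n_seen.
        self.n_seen += 1
        if len(self.memory) < self.capacity:
            self.memory.append((state, action))
        else:
            idx = random.randrange(self.n_seen)
            if idx < self.capacity:
                self.memory[idx] = (state, action)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        state, action = zip(*batch)
        return np.stack(state), np.array(action)

    def __len__(self):
        return len(self.memory)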
def average_strategy(self, state):  # renamed from the typo "average_stargiey"
    # Act greedily with respect to the average-strategy (SL) network.
    state = U.Variable(
        torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
    logits = self.model_sl(state)
    _, index = torch.max(logits, 1)
    return index.item()
def best_response(self, state):
    # Act greedily with respect to the action-value (RL) network.
    state = U.Variable(
        torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
    q_value = self.model_rl(state)
    _, index = torch.max(q_value, 1)
    return index.item()
def max_action(self, state):
    # Return the fixed default action when configured to do so; otherwise
    # act greedily with respect to the currently selected policy network.
    if self.bool_default_action:  # renamed from the typo "bool_defaule_action"
        return 2
    state = U.Variable(
        torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
    q_value = self.model(state)
    _, index = torch.max(q_value, 1)
    return index.item()
def train_rl(self):
    # Q-learning update: one DQN step on a batch sampled from the RL
    # replay buffer.
    assert len(self.buffer_rl) >= args.batch_size
    state, action, reward, next_state, done = self.buffer_rl.sample(
        args.batch_size)

    state = U.Variable(torch.FloatTensor(state.astype(np.float32)))
    next_state = U.Variable(
        torch.FloatTensor(next_state.astype(np.float32)))
    action = U.Variable(torch.LongTensor(action))
    reward = U.Variable(torch.FloatTensor(reward))
    done = U.Variable(torch.FloatTensor(done))

    q_values = self.model_rl(state)
    # Bootstrap from the target network when one is enabled.
    if self.flag_target_net:
        next_q_values = self.target_model_rl(next_state)
    else:
        next_q_values = self.model_rl(next_state)

    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value_max = next_q_values.max(1)[0]
    # One-step TD target; (1 - done) zeroes the bootstrap at episode ends.
    expected_q_value = reward + self.gamma * next_q_value_max * (1 - done)

    # detach() already blocks gradients into the target, so the extra
    # Variable wrapper around it was redundant and has been dropped.
    loss_rl = (q_value - expected_q_value.detach()).pow(2).mean()

    self.optimizer_rl.zero_grad()
    loss_rl.backward()
    self.optimizer_rl.step()
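# `train_rl` bootstraps from `self.target_model_rl` when `self.flag_target_net`
# is set, which implies the target network is synced with the online network
# somewhere else in the class. A minimal sketch of that sync, assuming a hard
# update every fixed number of steps; the method name and the
# `target_update_freq` argument are illustrative:

def update_target(self, step, target_update_freq=1000):
    # Hard update: copy the online Q-network's weights into the target
    # network so the TD targets only change every `target_update_freq` steps.
    if self.flag_target_net and step % target_update_freq == 0:
        self.target_model_rl.load_state_dict(self.model_rl.state_dict())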
def egreedy_action(self, state, epsilon_decay=1):
    # Epsilon-greedy exploration: anneal epsilon linearly down to 0.1,
    # then decay it geometrically by args.decay_rate.
    if epsilon_decay:
        if self.epsilon > 0.1:
            self.epsilon -= 0.000005
        else:
            self.epsilon *= args.decay_rate

    if random.random() > self.epsilon:
        # Exploit: greedy action from the currently selected network.
        state = U.Variable(
            torch.FloatTensor(state.astype(np.float32)).unsqueeze(0))
        q_value = self.model(state)
        _, index = torch.max(q_value, 1)
        action = index.item()
    else:
        # Explore: uniform random action.
        action = random.randrange(self.n_action)
    return action
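# How these selectors fit together: in NFSP each agent acts with its
# epsilon-greedy best response with some anticipatory probability, and with
# its average strategy otherwise; state-action pairs from best-response play
# are also logged to the SL buffer so the average strategy can imitate them.
# A minimal sketch of that control flow, assuming an attribute `self.eta`
# for the anticipatory probability (illustrative, not verbatim from this
# codebase):

def act(self, state):
    if random.random() < self.eta:
        # Best-response mode: act epsilon-greedily and record the chosen
        # action for supervised learning of the average strategy.
        action = self.egreedy_action(state)
        self.buffer_sl.push(state, action)
    else:
        # Average-strategy mode: follow the SL network's policy.
        action = self.average_strategy(state)
    return action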