import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class BQN(nn.Module):
    """Branching Q-network agent: a shared trunk, a state-value head, and one
    action head per action dimension (QNetwork is defined elsewhere)."""

    def __init__(self, state_space: int, action_num: int, action_scale: int,
                 learning_rate: float, device: str):
        super().__init__()
        self.action_num = action_num
        self.q = QNetwork(state_space, action_num, action_scale).to(device)
        self.target_q = QNetwork(state_space, action_num, action_scale).to(device)
        self.target_q.load_state_dict(self.q.state_dict())

        # The shared layers and the value head get a learning rate scaled down
        # by the number of branches; the per-branch action heads use the full rate.
        self.optimizer = optim.Adam([
            {'params': self.q.linear_1.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.linear_2.parameters(), 'lr': learning_rate / (action_num + 2)},
            {'params': self.q.value.parameters(),    'lr': learning_rate / (action_num + 2)},
            {'params': self.q.actions.parameters(),  'lr': learning_rate},
        ])
        self.update_freq = 1000   # hard target-network update interval, in train steps
        self.update_count = 0

    def action(self, x):
        return self.q(x)

    def train_mode(self, n_epi, memory, batch_size, gamma, use_tensorboard, writer):
        state, actions, reward, next_state, done_mask = memory.sample(batch_size)
        # actions: list of per-branch index tensors -> (batch, action_num, 1)
        actions = torch.stack(actions).transpose(0, 1).unsqueeze(-1)
        done_mask = torch.abs(done_mask - 1)  # 1 for non-terminal, 0 for terminal

        # Q(s, a) of the taken action in every branch: (batch, action_num)
        cur_actions = self.q(state)
        cur_actions = torch.stack(cur_actions).transpose(0, 1)
        cur_actions = cur_actions.gather(2, actions.long()).squeeze(-1)

        # Max target Q per branch, averaged across branches for the TD target.
        target_cur_actions = self.target_q(next_state)
        target_cur_actions = torch.stack(target_cur_actions).transpose(0, 1)
        target_cur_actions = target_cur_actions.max(-1, keepdim=True)[0]
        target_action = done_mask * gamma * target_cur_actions.mean(1) + reward

        # The same scalar target is regressed against every branch
        # (the original hard-coded repeat(1, 4), which only works for four branches).
        loss = F.mse_loss(cur_actions, target_action.repeat(1, self.action_num))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_count += 1
        if self.update_count % self.update_freq == 0:
            self.update_count = 0
            self.target_q.load_state_dict(self.q.state_dict())

        if use_tensorboard:
            writer.add_scalar("Loss/loss", loss, n_epi)
        return loss
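# A minimal sketch of the QNetwork that BQN assumes above; this is an
# assumption, not the original implementation. The attribute names
# (linear_1, linear_2, value, actions) match those referenced in BQN's
# optimizer: a shared trunk, a dueling state-value head, and one advantage
# head per action branch, each discretized into `action_scale` bins. The
# hidden width of 256 is likewise a placeholder.
class QNetwork(nn.Module):
    def __init__(self, state_space: int, action_num: int, action_scale: int):
        super().__init__()
        self.linear_1 = nn.Linear(state_space, 256)
        self.linear_2 = nn.Linear(256, 256)
        self.value = nn.Linear(256, 1)
        self.actions = nn.ModuleList(
            [nn.Linear(256, action_scale) for _ in range(action_num)])

    def forward(self, x):
        x = F.relu(self.linear_1(x))
        x = F.relu(self.linear_2(x))
        value = self.value(x)
        # Dueling combination per branch: Q = V + (A - mean(A)),
        # returned as a list of (batch, action_scale) tensors, which is
        # the shape BQN.train_mode stacks and gathers over.
        return [value + adv(x) - adv(x).mean(-1, keepdim=True)
                for adv in self.actions]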
import math
import random

import torch
import torch.nn.functional as F
import torch.optim as optim


class Brain:
    """Standard DQN agent with epsilon-greedy exploration, a replay memory,
    and a periodically synchronized target network."""

    def __init__(self, params):
        self.num_actions = params['num_actions']
        self.device = params['device']
        self.batch_size = params['batch_size']
        self.learning_rate = params['learning_rate']
        self.gamma = params['gamma']
        self.eps_start = params['eps_start']
        self.eps_end = params['eps_end']
        self.eps_decay = params['eps_decay']

        self.policy_net = QNetwork(self.num_actions).to(self.device)
        self.target_net = QNetwork(self.num_actions).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(params['replay_memory_size'])
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)
        self.steps_done = 0
        self.q_vals = [0] * self.num_actions
        self.loss = 0

    def decide_action(self, state):
        # Exponentially annealed epsilon-greedy exploration.
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        with torch.no_grad():
            self.q_vals = self.policy_net(
                torch.from_numpy(state).float().to(self.device).unsqueeze(0))
        if random.random() > eps_threshold:
            return self.q_vals.max(1)[1].view(1, 1)  # greedy action
        return torch.tensor([[random.randrange(self.num_actions)]],
                            device=self.device, dtype=torch.long)

    def optimize(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Terminal transitions store next_state=None; mask them out so their
        # bootstrapped value stays zero.
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([
            torch.tensor(s, device=self.device, dtype=torch.float)
            for s in batch.next_state if s is not None
        ])

        state_batch = torch.tensor(batch.state, device=self.device,
                                   dtype=torch.float)
        action_batch = torch.tensor(batch.action, device=self.device,
                                    dtype=torch.long)
        reward_batch = torch.tensor(batch.reward, device=self.device,
                                    dtype=torch.float)

        # Q(s_t, a_t) for the actions actually taken.
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch.unsqueeze(1))

        # max_a Q_target(s_{t+1}, a), detached so no gradient flows into the
        # target network.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.unsqueeze(1)).max(1)[0].detach()

        expected_state_action_values = next_state_values * self.gamma + reward_batch

        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        self.loss.backward()
        # Clip gradients element-wise to [-1, 1] for stability.
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
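# A minimal sketch of the Transition tuple and ReplayMemory that Brain relies
# on; both are assumptions inferred from how they are used above
# (memory.sample(batch_size) returns Transition tuples and terminal
# transitions store next_state=None). The field order follows the common DQN
# convention and is itself an assumption; it must match whatever order push()
# is called with.
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity: int):
        # A bounded deque evicts the oldest transitions once full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size: int):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)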